{ "best_global_step": 43641, "best_metric": 0.8025602698326111, "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_math_qa_1754652176/checkpoint-43641", "epoch": 10.0, "eval_steps": 3357, "global_step": 67140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007447125409591898, "grad_norm": 3.038577079772949, "learning_rate": 2.978850163836759e-08, "loss": 11.8795, "num_input_tokens_seen": 2976, "step": 5 }, { "epoch": 0.0014894250819183796, "grad_norm": 2.0987415313720703, "learning_rate": 6.702412868632709e-08, "loss": 11.9955, "num_input_tokens_seen": 5920, "step": 10 }, { "epoch": 0.002234137622877569, "grad_norm": 2.3021867275238037, "learning_rate": 1.0425975573428657e-07, "loss": 12.1827, "num_input_tokens_seen": 8832, "step": 15 }, { "epoch": 0.002978850163836759, "grad_norm": 2.754102945327759, "learning_rate": 1.4149538278224606e-07, "loss": 11.8842, "num_input_tokens_seen": 11648, "step": 20 }, { "epoch": 0.0037235627047959487, "grad_norm": 2.442565679550171, "learning_rate": 1.7873100983020555e-07, "loss": 12.2012, "num_input_tokens_seen": 14368, "step": 25 }, { "epoch": 0.004468275245755138, "grad_norm": 2.2573490142822266, "learning_rate": 2.1596663687816505e-07, "loss": 12.0618, "num_input_tokens_seen": 17280, "step": 30 }, { "epoch": 0.005212987786714328, "grad_norm": 2.436063289642334, "learning_rate": 2.532022639261245e-07, "loss": 12.2948, "num_input_tokens_seen": 20160, "step": 35 }, { "epoch": 0.005957700327673518, "grad_norm": 2.418266534805298, "learning_rate": 2.90437890974084e-07, "loss": 11.9709, "num_input_tokens_seen": 23104, "step": 40 }, { "epoch": 0.006702412868632708, "grad_norm": 2.4870247840881348, "learning_rate": 3.276735180220435e-07, "loss": 11.8904, "num_input_tokens_seen": 26048, "step": 45 }, { "epoch": 0.0074471254095918975, "grad_norm": 2.7109293937683105, "learning_rate": 3.64909145070003e-07, "loss": 11.9421, "num_input_tokens_seen": 28928, "step": 50 }, { "epoch": 0.008191837950551088, "grad_norm": 2.045679807662964, "learning_rate": 4.021447721179625e-07, "loss": 11.9304, "num_input_tokens_seen": 32256, "step": 55 }, { "epoch": 0.008936550491510277, "grad_norm": 2.2266690731048584, "learning_rate": 4.3938039916592203e-07, "loss": 12.0341, "num_input_tokens_seen": 35232, "step": 60 }, { "epoch": 0.009681263032469467, "grad_norm": 2.4481873512268066, "learning_rate": 4.7661602621388146e-07, "loss": 12.0816, "num_input_tokens_seen": 38112, "step": 65 }, { "epoch": 0.010425975573428656, "grad_norm": 2.511849880218506, "learning_rate": 5.13851653261841e-07, "loss": 11.5811, "num_input_tokens_seen": 41120, "step": 70 }, { "epoch": 0.011170688114387846, "grad_norm": 2.5591235160827637, "learning_rate": 5.510872803098004e-07, "loss": 11.8222, "num_input_tokens_seen": 44064, "step": 75 }, { "epoch": 0.011915400655347037, "grad_norm": 2.3036928176879883, "learning_rate": 5.8832290735776e-07, "loss": 12.2264, "num_input_tokens_seen": 46592, "step": 80 }, { "epoch": 0.012660113196306225, "grad_norm": 2.4396865367889404, "learning_rate": 6.255585344057195e-07, "loss": 11.7415, "num_input_tokens_seen": 49152, "step": 85 }, { "epoch": 0.013404825737265416, "grad_norm": 2.2426133155822754, "learning_rate": 6.627941614536789e-07, "loss": 11.9119, "num_input_tokens_seen": 51776, "step": 90 }, { "epoch": 0.014149538278224605, "grad_norm": 2.2078235149383545, "learning_rate": 7.000297885016385e-07, "loss": 11.9932, "num_input_tokens_seen": 54624, "step": 95 }, { "epoch": 0.014894250819183795, "grad_norm": 2.4721386432647705, "learning_rate": 7.372654155495979e-07, "loss": 12.0358, "num_input_tokens_seen": 57312, "step": 100 }, { "epoch": 0.015638963360142984, "grad_norm": 3.1662707328796387, "learning_rate": 7.745010425975574e-07, "loss": 12.0821, "num_input_tokens_seen": 60064, "step": 105 }, { "epoch": 0.016383675901102176, "grad_norm": 2.3638813495635986, "learning_rate": 8.11736669645517e-07, "loss": 12.1645, "num_input_tokens_seen": 63424, "step": 110 }, { "epoch": 0.017128388442061365, "grad_norm": 2.6412901878356934, "learning_rate": 8.489722966934764e-07, "loss": 12.069, "num_input_tokens_seen": 66080, "step": 115 }, { "epoch": 0.017873100983020553, "grad_norm": 2.3333370685577393, "learning_rate": 8.862079237414358e-07, "loss": 11.8895, "num_input_tokens_seen": 69440, "step": 120 }, { "epoch": 0.018617813523979745, "grad_norm": 2.308305501937866, "learning_rate": 9.234435507893953e-07, "loss": 11.8079, "num_input_tokens_seen": 72320, "step": 125 }, { "epoch": 0.019362526064938934, "grad_norm": 2.2811708450317383, "learning_rate": 9.606791778373549e-07, "loss": 12.3874, "num_input_tokens_seen": 75168, "step": 130 }, { "epoch": 0.020107238605898123, "grad_norm": 2.9698078632354736, "learning_rate": 9.979148048853143e-07, "loss": 11.8574, "num_input_tokens_seen": 78176, "step": 135 }, { "epoch": 0.02085195114685731, "grad_norm": 2.2576522827148438, "learning_rate": 1.035150431933274e-06, "loss": 12.0448, "num_input_tokens_seen": 81248, "step": 140 }, { "epoch": 0.021596663687816504, "grad_norm": 2.3547327518463135, "learning_rate": 1.0723860589812334e-06, "loss": 11.9945, "num_input_tokens_seen": 84416, "step": 145 }, { "epoch": 0.022341376228775692, "grad_norm": 2.111647129058838, "learning_rate": 1.1096216860291928e-06, "loss": 11.9378, "num_input_tokens_seen": 87584, "step": 150 }, { "epoch": 0.02308608876973488, "grad_norm": 2.3381199836730957, "learning_rate": 1.1468573130771522e-06, "loss": 12.0914, "num_input_tokens_seen": 90368, "step": 155 }, { "epoch": 0.023830801310694073, "grad_norm": 2.3707849979400635, "learning_rate": 1.1840929401251119e-06, "loss": 11.9843, "num_input_tokens_seen": 93152, "step": 160 }, { "epoch": 0.024575513851653262, "grad_norm": 2.224364757537842, "learning_rate": 1.2213285671730713e-06, "loss": 11.7642, "num_input_tokens_seen": 96384, "step": 165 }, { "epoch": 0.02532022639261245, "grad_norm": 2.2933475971221924, "learning_rate": 1.2585641942210307e-06, "loss": 11.5938, "num_input_tokens_seen": 99136, "step": 170 }, { "epoch": 0.026064938933571643, "grad_norm": 2.457611560821533, "learning_rate": 1.2957998212689904e-06, "loss": 12.0167, "num_input_tokens_seen": 101824, "step": 175 }, { "epoch": 0.02680965147453083, "grad_norm": 2.7374699115753174, "learning_rate": 1.3330354483169498e-06, "loss": 11.9466, "num_input_tokens_seen": 104832, "step": 180 }, { "epoch": 0.02755436401549002, "grad_norm": 2.189600944519043, "learning_rate": 1.3702710753649092e-06, "loss": 12.0045, "num_input_tokens_seen": 107680, "step": 185 }, { "epoch": 0.02829907655644921, "grad_norm": 2.210780382156372, "learning_rate": 1.4075067024128687e-06, "loss": 11.9431, "num_input_tokens_seen": 110560, "step": 190 }, { "epoch": 0.0290437890974084, "grad_norm": 2.2957558631896973, "learning_rate": 1.4447423294608283e-06, "loss": 11.8927, "num_input_tokens_seen": 113696, "step": 195 }, { "epoch": 0.02978850163836759, "grad_norm": 2.1968038082122803, "learning_rate": 1.4819779565087877e-06, "loss": 11.9634, "num_input_tokens_seen": 116480, "step": 200 }, { "epoch": 0.03053321417932678, "grad_norm": 2.5095291137695312, "learning_rate": 1.5192135835567472e-06, "loss": 11.8068, "num_input_tokens_seen": 119712, "step": 205 }, { "epoch": 0.03127792672028597, "grad_norm": 2.1350576877593994, "learning_rate": 1.5564492106047066e-06, "loss": 11.6063, "num_input_tokens_seen": 122944, "step": 210 }, { "epoch": 0.032022639261245156, "grad_norm": 2.231414556503296, "learning_rate": 1.593684837652666e-06, "loss": 11.9658, "num_input_tokens_seen": 125984, "step": 215 }, { "epoch": 0.03276735180220435, "grad_norm": 2.278703212738037, "learning_rate": 1.6309204647006257e-06, "loss": 11.7791, "num_input_tokens_seen": 129216, "step": 220 }, { "epoch": 0.03351206434316354, "grad_norm": 2.2274439334869385, "learning_rate": 1.668156091748585e-06, "loss": 11.7954, "num_input_tokens_seen": 131808, "step": 225 }, { "epoch": 0.03425677688412273, "grad_norm": 2.1787869930267334, "learning_rate": 1.7053917187965447e-06, "loss": 11.7746, "num_input_tokens_seen": 135040, "step": 230 }, { "epoch": 0.03500148942508192, "grad_norm": 2.3129770755767822, "learning_rate": 1.7426273458445042e-06, "loss": 11.8909, "num_input_tokens_seen": 137792, "step": 235 }, { "epoch": 0.035746201966041107, "grad_norm": 2.055553436279297, "learning_rate": 1.7798629728924636e-06, "loss": 11.5593, "num_input_tokens_seen": 140800, "step": 240 }, { "epoch": 0.036490914507000295, "grad_norm": 2.2647511959075928, "learning_rate": 1.817098599940423e-06, "loss": 11.6352, "num_input_tokens_seen": 143616, "step": 245 }, { "epoch": 0.03723562704795949, "grad_norm": 2.032160520553589, "learning_rate": 1.8543342269883825e-06, "loss": 11.5901, "num_input_tokens_seen": 146592, "step": 250 }, { "epoch": 0.03798033958891868, "grad_norm": 2.0666050910949707, "learning_rate": 1.8915698540363419e-06, "loss": 11.7638, "num_input_tokens_seen": 149376, "step": 255 }, { "epoch": 0.03872505212987787, "grad_norm": 2.4358747005462646, "learning_rate": 1.9288054810843013e-06, "loss": 11.9027, "num_input_tokens_seen": 152256, "step": 260 }, { "epoch": 0.03946976467083706, "grad_norm": 2.3036386966705322, "learning_rate": 1.966041108132261e-06, "loss": 11.7739, "num_input_tokens_seen": 155168, "step": 265 }, { "epoch": 0.040214477211796246, "grad_norm": 2.0369021892547607, "learning_rate": 2.0032767351802206e-06, "loss": 11.7754, "num_input_tokens_seen": 158016, "step": 270 }, { "epoch": 0.040959189752755434, "grad_norm": 2.7754688262939453, "learning_rate": 2.04051236222818e-06, "loss": 12.0539, "num_input_tokens_seen": 160992, "step": 275 }, { "epoch": 0.04170390229371462, "grad_norm": 2.214980363845825, "learning_rate": 2.0777479892761395e-06, "loss": 11.5101, "num_input_tokens_seen": 163616, "step": 280 }, { "epoch": 0.04244861483467382, "grad_norm": 2.054042339324951, "learning_rate": 2.114983616324099e-06, "loss": 11.7727, "num_input_tokens_seen": 166496, "step": 285 }, { "epoch": 0.04319332737563301, "grad_norm": 2.65145206451416, "learning_rate": 2.1522192433720583e-06, "loss": 11.6312, "num_input_tokens_seen": 169248, "step": 290 }, { "epoch": 0.043938039916592196, "grad_norm": 2.2317593097686768, "learning_rate": 2.1894548704200177e-06, "loss": 11.5483, "num_input_tokens_seen": 172160, "step": 295 }, { "epoch": 0.044682752457551385, "grad_norm": 2.1837639808654785, "learning_rate": 2.2266904974679776e-06, "loss": 11.7643, "num_input_tokens_seen": 174880, "step": 300 }, { "epoch": 0.045427464998510574, "grad_norm": 2.0390024185180664, "learning_rate": 2.263926124515937e-06, "loss": 11.8734, "num_input_tokens_seen": 177536, "step": 305 }, { "epoch": 0.04617217753946976, "grad_norm": 2.2879624366760254, "learning_rate": 2.3011617515638965e-06, "loss": 11.3407, "num_input_tokens_seen": 180384, "step": 310 }, { "epoch": 0.04691689008042895, "grad_norm": 2.321617603302002, "learning_rate": 2.338397378611856e-06, "loss": 11.3299, "num_input_tokens_seen": 183104, "step": 315 }, { "epoch": 0.04766160262138815, "grad_norm": 2.259108543395996, "learning_rate": 2.3756330056598153e-06, "loss": 11.8681, "num_input_tokens_seen": 185792, "step": 320 }, { "epoch": 0.048406315162347335, "grad_norm": 2.1209421157836914, "learning_rate": 2.4128686327077747e-06, "loss": 11.5637, "num_input_tokens_seen": 188640, "step": 325 }, { "epoch": 0.049151027703306524, "grad_norm": 1.9656914472579956, "learning_rate": 2.4501042597557346e-06, "loss": 11.5898, "num_input_tokens_seen": 191328, "step": 330 }, { "epoch": 0.04989574024426571, "grad_norm": 2.3816967010498047, "learning_rate": 2.487339886803694e-06, "loss": 11.4413, "num_input_tokens_seen": 194464, "step": 335 }, { "epoch": 0.0506404527852249, "grad_norm": 2.3446195125579834, "learning_rate": 2.5245755138516535e-06, "loss": 11.8396, "num_input_tokens_seen": 196960, "step": 340 }, { "epoch": 0.05138516532618409, "grad_norm": 1.9140510559082031, "learning_rate": 2.561811140899613e-06, "loss": 11.871, "num_input_tokens_seen": 199840, "step": 345 }, { "epoch": 0.052129877867143286, "grad_norm": 2.214324951171875, "learning_rate": 2.5990467679475723e-06, "loss": 11.5578, "num_input_tokens_seen": 202560, "step": 350 }, { "epoch": 0.052874590408102475, "grad_norm": 2.247530460357666, "learning_rate": 2.6362823949955317e-06, "loss": 11.6567, "num_input_tokens_seen": 205600, "step": 355 }, { "epoch": 0.05361930294906166, "grad_norm": 2.0697758197784424, "learning_rate": 2.673518022043491e-06, "loss": 11.5071, "num_input_tokens_seen": 208672, "step": 360 }, { "epoch": 0.05436401549002085, "grad_norm": 2.1091668605804443, "learning_rate": 2.710753649091451e-06, "loss": 11.3948, "num_input_tokens_seen": 211520, "step": 365 }, { "epoch": 0.05510872803098004, "grad_norm": 2.2571914196014404, "learning_rate": 2.7479892761394105e-06, "loss": 11.5754, "num_input_tokens_seen": 214496, "step": 370 }, { "epoch": 0.05585344057193923, "grad_norm": 2.039344549179077, "learning_rate": 2.78522490318737e-06, "loss": 11.3605, "num_input_tokens_seen": 217216, "step": 375 }, { "epoch": 0.05659815311289842, "grad_norm": 2.3484091758728027, "learning_rate": 2.8224605302353293e-06, "loss": 11.4383, "num_input_tokens_seen": 220000, "step": 380 }, { "epoch": 0.057342865653857614, "grad_norm": 2.147984027862549, "learning_rate": 2.8596961572832887e-06, "loss": 11.3111, "num_input_tokens_seen": 222816, "step": 385 }, { "epoch": 0.0580875781948168, "grad_norm": 2.8211581707000732, "learning_rate": 2.896931784331248e-06, "loss": 11.3558, "num_input_tokens_seen": 225664, "step": 390 }, { "epoch": 0.05883229073577599, "grad_norm": 2.410921096801758, "learning_rate": 2.9341674113792076e-06, "loss": 11.2352, "num_input_tokens_seen": 228800, "step": 395 }, { "epoch": 0.05957700327673518, "grad_norm": 1.996368408203125, "learning_rate": 2.9714030384271675e-06, "loss": 11.3296, "num_input_tokens_seen": 232064, "step": 400 }, { "epoch": 0.06032171581769437, "grad_norm": 2.0263571739196777, "learning_rate": 3.008638665475127e-06, "loss": 11.5715, "num_input_tokens_seen": 234880, "step": 405 }, { "epoch": 0.06106642835865356, "grad_norm": 2.2315497398376465, "learning_rate": 3.0458742925230863e-06, "loss": 11.2985, "num_input_tokens_seen": 237856, "step": 410 }, { "epoch": 0.06181114089961275, "grad_norm": 1.9646835327148438, "learning_rate": 3.0831099195710457e-06, "loss": 11.1372, "num_input_tokens_seen": 240608, "step": 415 }, { "epoch": 0.06255585344057193, "grad_norm": 2.099736213684082, "learning_rate": 3.120345546619005e-06, "loss": 11.1533, "num_input_tokens_seen": 243168, "step": 420 }, { "epoch": 0.06330056598153112, "grad_norm": 1.982100248336792, "learning_rate": 3.1575811736669646e-06, "loss": 11.2868, "num_input_tokens_seen": 246304, "step": 425 }, { "epoch": 0.06404527852249031, "grad_norm": 2.3176445960998535, "learning_rate": 3.194816800714924e-06, "loss": 11.4806, "num_input_tokens_seen": 249024, "step": 430 }, { "epoch": 0.06478999106344951, "grad_norm": 2.1351895332336426, "learning_rate": 3.2320524277628835e-06, "loss": 11.0132, "num_input_tokens_seen": 252224, "step": 435 }, { "epoch": 0.0655347036044087, "grad_norm": 1.9889116287231445, "learning_rate": 3.269288054810843e-06, "loss": 11.4024, "num_input_tokens_seen": 254848, "step": 440 }, { "epoch": 0.06627941614536789, "grad_norm": 1.9773496389389038, "learning_rate": 3.3065236818588023e-06, "loss": 11.321, "num_input_tokens_seen": 257760, "step": 445 }, { "epoch": 0.06702412868632708, "grad_norm": 1.8294379711151123, "learning_rate": 3.3437593089067626e-06, "loss": 11.2019, "num_input_tokens_seen": 260384, "step": 450 }, { "epoch": 0.06776884122728627, "grad_norm": 2.220461130142212, "learning_rate": 3.380994935954722e-06, "loss": 10.984, "num_input_tokens_seen": 263360, "step": 455 }, { "epoch": 0.06851355376824546, "grad_norm": 2.0433566570281982, "learning_rate": 3.4182305630026814e-06, "loss": 11.0445, "num_input_tokens_seen": 266112, "step": 460 }, { "epoch": 0.06925826630920465, "grad_norm": 2.142548084259033, "learning_rate": 3.455466190050641e-06, "loss": 11.2099, "num_input_tokens_seen": 268672, "step": 465 }, { "epoch": 0.07000297885016384, "grad_norm": 2.0808048248291016, "learning_rate": 3.4927018170986003e-06, "loss": 11.0895, "num_input_tokens_seen": 271328, "step": 470 }, { "epoch": 0.07074769139112302, "grad_norm": 2.2481720447540283, "learning_rate": 3.5299374441465597e-06, "loss": 10.9189, "num_input_tokens_seen": 274144, "step": 475 }, { "epoch": 0.07149240393208221, "grad_norm": 2.0031166076660156, "learning_rate": 3.567173071194519e-06, "loss": 10.7885, "num_input_tokens_seen": 277024, "step": 480 }, { "epoch": 0.0722371164730414, "grad_norm": 1.9143702983856201, "learning_rate": 3.6044086982424786e-06, "loss": 11.2372, "num_input_tokens_seen": 279840, "step": 485 }, { "epoch": 0.07298182901400059, "grad_norm": 2.070237874984741, "learning_rate": 3.641644325290438e-06, "loss": 10.9862, "num_input_tokens_seen": 283200, "step": 490 }, { "epoch": 0.07372654155495978, "grad_norm": 2.1270477771759033, "learning_rate": 3.6788799523383975e-06, "loss": 11.078, "num_input_tokens_seen": 286016, "step": 495 }, { "epoch": 0.07447125409591898, "grad_norm": 2.2862653732299805, "learning_rate": 3.716115579386357e-06, "loss": 11.0173, "num_input_tokens_seen": 288864, "step": 500 }, { "epoch": 0.07521596663687817, "grad_norm": 2.032426118850708, "learning_rate": 3.7533512064343163e-06, "loss": 10.8477, "num_input_tokens_seen": 291904, "step": 505 }, { "epoch": 0.07596067917783736, "grad_norm": 2.220597267150879, "learning_rate": 3.7905868334822757e-06, "loss": 11.0654, "num_input_tokens_seen": 294688, "step": 510 }, { "epoch": 0.07670539171879655, "grad_norm": 1.9895023107528687, "learning_rate": 3.827822460530236e-06, "loss": 10.8012, "num_input_tokens_seen": 297792, "step": 515 }, { "epoch": 0.07745010425975574, "grad_norm": 1.9533551931381226, "learning_rate": 3.8650580875781954e-06, "loss": 11.0254, "num_input_tokens_seen": 300736, "step": 520 }, { "epoch": 0.07819481680071493, "grad_norm": 2.177260637283325, "learning_rate": 3.9022937146261545e-06, "loss": 10.7824, "num_input_tokens_seen": 303488, "step": 525 }, { "epoch": 0.07893952934167411, "grad_norm": 2.0749213695526123, "learning_rate": 3.939529341674114e-06, "loss": 10.63, "num_input_tokens_seen": 306624, "step": 530 }, { "epoch": 0.0796842418826333, "grad_norm": 2.234720230102539, "learning_rate": 3.976764968722073e-06, "loss": 10.6557, "num_input_tokens_seen": 309792, "step": 535 }, { "epoch": 0.08042895442359249, "grad_norm": 1.7911285161972046, "learning_rate": 4.014000595770033e-06, "loss": 10.8511, "num_input_tokens_seen": 312576, "step": 540 }, { "epoch": 0.08117366696455168, "grad_norm": 2.4040207862854004, "learning_rate": 4.051236222817992e-06, "loss": 11.1237, "num_input_tokens_seen": 315360, "step": 545 }, { "epoch": 0.08191837950551087, "grad_norm": 2.1303703784942627, "learning_rate": 4.088471849865952e-06, "loss": 10.7885, "num_input_tokens_seen": 318016, "step": 550 }, { "epoch": 0.08266309204647006, "grad_norm": 2.029956817626953, "learning_rate": 4.125707476913911e-06, "loss": 10.6039, "num_input_tokens_seen": 320768, "step": 555 }, { "epoch": 0.08340780458742925, "grad_norm": 2.089937925338745, "learning_rate": 4.162943103961871e-06, "loss": 11.1311, "num_input_tokens_seen": 323680, "step": 560 }, { "epoch": 0.08415251712838845, "grad_norm": 2.3609962463378906, "learning_rate": 4.20017873100983e-06, "loss": 10.5363, "num_input_tokens_seen": 326624, "step": 565 }, { "epoch": 0.08489722966934764, "grad_norm": 2.2993454933166504, "learning_rate": 4.23741435805779e-06, "loss": 10.7163, "num_input_tokens_seen": 329216, "step": 570 }, { "epoch": 0.08564194221030683, "grad_norm": 1.8907978534698486, "learning_rate": 4.274649985105749e-06, "loss": 10.7221, "num_input_tokens_seen": 331904, "step": 575 }, { "epoch": 0.08638665475126601, "grad_norm": 2.135169744491577, "learning_rate": 4.3118856121537094e-06, "loss": 10.8286, "num_input_tokens_seen": 334752, "step": 580 }, { "epoch": 0.0871313672922252, "grad_norm": 2.0691521167755127, "learning_rate": 4.3491212392016685e-06, "loss": 10.5857, "num_input_tokens_seen": 337984, "step": 585 }, { "epoch": 0.08787607983318439, "grad_norm": 1.8288371562957764, "learning_rate": 4.386356866249628e-06, "loss": 10.7884, "num_input_tokens_seen": 340800, "step": 590 }, { "epoch": 0.08862079237414358, "grad_norm": 1.9544728994369507, "learning_rate": 4.423592493297587e-06, "loss": 10.5613, "num_input_tokens_seen": 343424, "step": 595 }, { "epoch": 0.08936550491510277, "grad_norm": 2.2080373764038086, "learning_rate": 4.460828120345547e-06, "loss": 10.2494, "num_input_tokens_seen": 346720, "step": 600 }, { "epoch": 0.09011021745606196, "grad_norm": 1.9826154708862305, "learning_rate": 4.498063747393506e-06, "loss": 10.6137, "num_input_tokens_seen": 349728, "step": 605 }, { "epoch": 0.09085492999702115, "grad_norm": 2.2852494716644287, "learning_rate": 4.535299374441466e-06, "loss": 10.4434, "num_input_tokens_seen": 352352, "step": 610 }, { "epoch": 0.09159964253798034, "grad_norm": 2.068514347076416, "learning_rate": 4.572535001489425e-06, "loss": 10.6016, "num_input_tokens_seen": 355168, "step": 615 }, { "epoch": 0.09234435507893952, "grad_norm": 2.2334346771240234, "learning_rate": 4.609770628537385e-06, "loss": 10.7799, "num_input_tokens_seen": 357920, "step": 620 }, { "epoch": 0.09308906761989871, "grad_norm": 1.9635531902313232, "learning_rate": 4.647006255585344e-06, "loss": 10.1634, "num_input_tokens_seen": 360704, "step": 625 }, { "epoch": 0.0938337801608579, "grad_norm": 1.886508822441101, "learning_rate": 4.684241882633304e-06, "loss": 10.6419, "num_input_tokens_seen": 363872, "step": 630 }, { "epoch": 0.0945784927018171, "grad_norm": 1.8760178089141846, "learning_rate": 4.721477509681263e-06, "loss": 10.4323, "num_input_tokens_seen": 366976, "step": 635 }, { "epoch": 0.0953232052427763, "grad_norm": 2.1390514373779297, "learning_rate": 4.758713136729223e-06, "loss": 10.5525, "num_input_tokens_seen": 369824, "step": 640 }, { "epoch": 0.09606791778373548, "grad_norm": 2.6132924556732178, "learning_rate": 4.7959487637771824e-06, "loss": 9.9849, "num_input_tokens_seen": 372832, "step": 645 }, { "epoch": 0.09681263032469467, "grad_norm": 2.092278242111206, "learning_rate": 4.833184390825142e-06, "loss": 10.586, "num_input_tokens_seen": 375872, "step": 650 }, { "epoch": 0.09755734286565386, "grad_norm": 2.355943441390991, "learning_rate": 4.870420017873101e-06, "loss": 10.5323, "num_input_tokens_seen": 378752, "step": 655 }, { "epoch": 0.09830205540661305, "grad_norm": 2.050346851348877, "learning_rate": 4.907655644921061e-06, "loss": 10.0969, "num_input_tokens_seen": 381632, "step": 660 }, { "epoch": 0.09904676794757224, "grad_norm": 1.7758240699768066, "learning_rate": 4.94489127196902e-06, "loss": 10.2259, "num_input_tokens_seen": 384832, "step": 665 }, { "epoch": 0.09979148048853143, "grad_norm": 1.876861572265625, "learning_rate": 4.98212689901698e-06, "loss": 10.2384, "num_input_tokens_seen": 388192, "step": 670 }, { "epoch": 0.10053619302949061, "grad_norm": 2.1082050800323486, "learning_rate": 5.019362526064939e-06, "loss": 10.131, "num_input_tokens_seen": 391072, "step": 675 }, { "epoch": 0.1012809055704498, "grad_norm": 1.8649810552597046, "learning_rate": 5.056598153112899e-06, "loss": 10.3725, "num_input_tokens_seen": 393792, "step": 680 }, { "epoch": 0.10202561811140899, "grad_norm": 2.3847506046295166, "learning_rate": 5.093833780160858e-06, "loss": 10.1053, "num_input_tokens_seen": 396800, "step": 685 }, { "epoch": 0.10277033065236818, "grad_norm": 2.1582956314086914, "learning_rate": 5.131069407208818e-06, "loss": 10.2065, "num_input_tokens_seen": 399648, "step": 690 }, { "epoch": 0.10351504319332737, "grad_norm": 2.1493117809295654, "learning_rate": 5.168305034256777e-06, "loss": 10.0892, "num_input_tokens_seen": 402528, "step": 695 }, { "epoch": 0.10425975573428657, "grad_norm": 2.1465907096862793, "learning_rate": 5.205540661304737e-06, "loss": 10.1106, "num_input_tokens_seen": 405696, "step": 700 }, { "epoch": 0.10500446827524576, "grad_norm": 2.043977975845337, "learning_rate": 5.242776288352696e-06, "loss": 10.2986, "num_input_tokens_seen": 408864, "step": 705 }, { "epoch": 0.10574918081620495, "grad_norm": 2.055931806564331, "learning_rate": 5.2800119154006555e-06, "loss": 9.9672, "num_input_tokens_seen": 411616, "step": 710 }, { "epoch": 0.10649389335716414, "grad_norm": 2.41117000579834, "learning_rate": 5.317247542448615e-06, "loss": 9.8675, "num_input_tokens_seen": 414528, "step": 715 }, { "epoch": 0.10723860589812333, "grad_norm": 2.2369418144226074, "learning_rate": 5.354483169496575e-06, "loss": 10.0619, "num_input_tokens_seen": 417440, "step": 720 }, { "epoch": 0.10798331843908252, "grad_norm": 2.118785858154297, "learning_rate": 5.391718796544534e-06, "loss": 10.2616, "num_input_tokens_seen": 420288, "step": 725 }, { "epoch": 0.1087280309800417, "grad_norm": 2.3948447704315186, "learning_rate": 5.428954423592494e-06, "loss": 10.0413, "num_input_tokens_seen": 423168, "step": 730 }, { "epoch": 0.10947274352100089, "grad_norm": 2.1954169273376465, "learning_rate": 5.466190050640453e-06, "loss": 9.8698, "num_input_tokens_seen": 426176, "step": 735 }, { "epoch": 0.11021745606196008, "grad_norm": 1.9468891620635986, "learning_rate": 5.503425677688413e-06, "loss": 9.6922, "num_input_tokens_seen": 429344, "step": 740 }, { "epoch": 0.11096216860291927, "grad_norm": 2.0778276920318604, "learning_rate": 5.540661304736372e-06, "loss": 9.9068, "num_input_tokens_seen": 432032, "step": 745 }, { "epoch": 0.11170688114387846, "grad_norm": 2.2648956775665283, "learning_rate": 5.577896931784332e-06, "loss": 9.761, "num_input_tokens_seen": 434784, "step": 750 }, { "epoch": 0.11245159368483765, "grad_norm": 1.937353491783142, "learning_rate": 5.615132558832291e-06, "loss": 10.106, "num_input_tokens_seen": 437952, "step": 755 }, { "epoch": 0.11319630622579684, "grad_norm": 1.9985944032669067, "learning_rate": 5.652368185880251e-06, "loss": 9.697, "num_input_tokens_seen": 440672, "step": 760 }, { "epoch": 0.11394101876675604, "grad_norm": 1.9219201803207397, "learning_rate": 5.68960381292821e-06, "loss": 9.7447, "num_input_tokens_seen": 443680, "step": 765 }, { "epoch": 0.11468573130771523, "grad_norm": 2.06246280670166, "learning_rate": 5.7268394399761695e-06, "loss": 9.6068, "num_input_tokens_seen": 446400, "step": 770 }, { "epoch": 0.11543044384867442, "grad_norm": 2.2600436210632324, "learning_rate": 5.7640750670241285e-06, "loss": 9.505, "num_input_tokens_seen": 449184, "step": 775 }, { "epoch": 0.1161751563896336, "grad_norm": 1.9767109155654907, "learning_rate": 5.801310694072089e-06, "loss": 9.7004, "num_input_tokens_seen": 452032, "step": 780 }, { "epoch": 0.1169198689305928, "grad_norm": 2.0418713092803955, "learning_rate": 5.838546321120048e-06, "loss": 9.8322, "num_input_tokens_seen": 455072, "step": 785 }, { "epoch": 0.11766458147155198, "grad_norm": 2.2258667945861816, "learning_rate": 5.875781948168008e-06, "loss": 9.8638, "num_input_tokens_seen": 457760, "step": 790 }, { "epoch": 0.11840929401251117, "grad_norm": 2.300048828125, "learning_rate": 5.913017575215967e-06, "loss": 9.664, "num_input_tokens_seen": 460608, "step": 795 }, { "epoch": 0.11915400655347036, "grad_norm": 2.191221237182617, "learning_rate": 5.950253202263927e-06, "loss": 9.6458, "num_input_tokens_seen": 463456, "step": 800 }, { "epoch": 0.11989871909442955, "grad_norm": 2.2352774143218994, "learning_rate": 5.987488829311886e-06, "loss": 9.6353, "num_input_tokens_seen": 466336, "step": 805 }, { "epoch": 0.12064343163538874, "grad_norm": 2.058495283126831, "learning_rate": 6.024724456359846e-06, "loss": 9.7419, "num_input_tokens_seen": 469280, "step": 810 }, { "epoch": 0.12138814417634793, "grad_norm": 1.9756073951721191, "learning_rate": 6.061960083407805e-06, "loss": 9.3537, "num_input_tokens_seen": 472096, "step": 815 }, { "epoch": 0.12213285671730711, "grad_norm": 2.1040568351745605, "learning_rate": 6.099195710455765e-06, "loss": 9.6838, "num_input_tokens_seen": 474944, "step": 820 }, { "epoch": 0.1228775692582663, "grad_norm": 1.915040373802185, "learning_rate": 6.136431337503724e-06, "loss": 9.7152, "num_input_tokens_seen": 477696, "step": 825 }, { "epoch": 0.1236222817992255, "grad_norm": 1.715733289718628, "learning_rate": 6.1736669645516834e-06, "loss": 9.4561, "num_input_tokens_seen": 480672, "step": 830 }, { "epoch": 0.1243669943401847, "grad_norm": 1.9369769096374512, "learning_rate": 6.2109025915996425e-06, "loss": 9.1817, "num_input_tokens_seen": 483712, "step": 835 }, { "epoch": 0.12511170688114387, "grad_norm": 1.9337126016616821, "learning_rate": 6.248138218647602e-06, "loss": 9.1897, "num_input_tokens_seen": 486528, "step": 840 }, { "epoch": 0.12585641942210307, "grad_norm": 1.7353464365005493, "learning_rate": 6.285373845695562e-06, "loss": 9.4632, "num_input_tokens_seen": 489184, "step": 845 }, { "epoch": 0.12660113196306225, "grad_norm": 2.3471813201904297, "learning_rate": 6.322609472743521e-06, "loss": 9.0991, "num_input_tokens_seen": 492096, "step": 850 }, { "epoch": 0.12734584450402145, "grad_norm": 2.09800124168396, "learning_rate": 6.359845099791481e-06, "loss": 9.012, "num_input_tokens_seen": 495168, "step": 855 }, { "epoch": 0.12809055704498062, "grad_norm": 2.0434625148773193, "learning_rate": 6.39708072683944e-06, "loss": 9.3701, "num_input_tokens_seen": 498112, "step": 860 }, { "epoch": 0.12883526958593983, "grad_norm": 2.306313991546631, "learning_rate": 6.4343163538874e-06, "loss": 9.0579, "num_input_tokens_seen": 501216, "step": 865 }, { "epoch": 0.12957998212689903, "grad_norm": 1.8980270624160767, "learning_rate": 6.471551980935359e-06, "loss": 9.0539, "num_input_tokens_seen": 504000, "step": 870 }, { "epoch": 0.1303246946678582, "grad_norm": 2.079333543777466, "learning_rate": 6.508787607983319e-06, "loss": 9.5202, "num_input_tokens_seen": 506912, "step": 875 }, { "epoch": 0.1310694072088174, "grad_norm": 2.2002434730529785, "learning_rate": 6.546023235031279e-06, "loss": 9.1785, "num_input_tokens_seen": 509696, "step": 880 }, { "epoch": 0.13181411974977658, "grad_norm": 1.9346833229064941, "learning_rate": 6.583258862079238e-06, "loss": 9.2682, "num_input_tokens_seen": 512544, "step": 885 }, { "epoch": 0.13255883229073578, "grad_norm": 2.2415595054626465, "learning_rate": 6.6204944891271974e-06, "loss": 9.1969, "num_input_tokens_seen": 515392, "step": 890 }, { "epoch": 0.13330354483169496, "grad_norm": 1.9097265005111694, "learning_rate": 6.6577301161751565e-06, "loss": 9.114, "num_input_tokens_seen": 518560, "step": 895 }, { "epoch": 0.13404825737265416, "grad_norm": 1.8732140064239502, "learning_rate": 6.694965743223116e-06, "loss": 9.3423, "num_input_tokens_seen": 521440, "step": 900 }, { "epoch": 0.13479296991361334, "grad_norm": 2.3684871196746826, "learning_rate": 6.732201370271075e-06, "loss": 9.1667, "num_input_tokens_seen": 524352, "step": 905 }, { "epoch": 0.13553768245457254, "grad_norm": 2.3284409046173096, "learning_rate": 6.769436997319035e-06, "loss": 8.9228, "num_input_tokens_seen": 527808, "step": 910 }, { "epoch": 0.1362823949955317, "grad_norm": 3.8652827739715576, "learning_rate": 6.806672624366994e-06, "loss": 9.1051, "num_input_tokens_seen": 530688, "step": 915 }, { "epoch": 0.13702710753649092, "grad_norm": 2.329401969909668, "learning_rate": 6.843908251414954e-06, "loss": 9.0129, "num_input_tokens_seen": 533408, "step": 920 }, { "epoch": 0.1377718200774501, "grad_norm": 2.200260639190674, "learning_rate": 6.881143878462913e-06, "loss": 8.8425, "num_input_tokens_seen": 536480, "step": 925 }, { "epoch": 0.1385165326184093, "grad_norm": 2.217315912246704, "learning_rate": 6.918379505510873e-06, "loss": 8.8963, "num_input_tokens_seen": 539072, "step": 930 }, { "epoch": 0.1392612451593685, "grad_norm": 2.111829996109009, "learning_rate": 6.955615132558832e-06, "loss": 8.9241, "num_input_tokens_seen": 542208, "step": 935 }, { "epoch": 0.14000595770032767, "grad_norm": 2.2751564979553223, "learning_rate": 6.992850759606792e-06, "loss": 8.8181, "num_input_tokens_seen": 545152, "step": 940 }, { "epoch": 0.14075067024128687, "grad_norm": 2.395211935043335, "learning_rate": 7.0300863866547524e-06, "loss": 8.6468, "num_input_tokens_seen": 547776, "step": 945 }, { "epoch": 0.14149538278224605, "grad_norm": 1.9969617128372192, "learning_rate": 7.067322013702711e-06, "loss": 8.8753, "num_input_tokens_seen": 550656, "step": 950 }, { "epoch": 0.14224009532320525, "grad_norm": 2.2365922927856445, "learning_rate": 7.104557640750671e-06, "loss": 8.5128, "num_input_tokens_seen": 553376, "step": 955 }, { "epoch": 0.14298480786416443, "grad_norm": 2.0293684005737305, "learning_rate": 7.14179326779863e-06, "loss": 8.8969, "num_input_tokens_seen": 556512, "step": 960 }, { "epoch": 0.14372952040512363, "grad_norm": 1.9731566905975342, "learning_rate": 7.17902889484659e-06, "loss": 8.7756, "num_input_tokens_seen": 559520, "step": 965 }, { "epoch": 0.1444742329460828, "grad_norm": 2.7694454193115234, "learning_rate": 7.216264521894549e-06, "loss": 8.9774, "num_input_tokens_seen": 562464, "step": 970 }, { "epoch": 0.145218945487042, "grad_norm": 2.0134096145629883, "learning_rate": 7.253500148942509e-06, "loss": 8.5534, "num_input_tokens_seen": 564960, "step": 975 }, { "epoch": 0.14596365802800118, "grad_norm": 2.1638946533203125, "learning_rate": 7.290735775990468e-06, "loss": 8.5909, "num_input_tokens_seen": 568032, "step": 980 }, { "epoch": 0.14670837056896038, "grad_norm": 1.9823126792907715, "learning_rate": 7.327971403038428e-06, "loss": 8.7011, "num_input_tokens_seen": 570912, "step": 985 }, { "epoch": 0.14745308310991956, "grad_norm": 2.086918830871582, "learning_rate": 7.365207030086387e-06, "loss": 8.3865, "num_input_tokens_seen": 573696, "step": 990 }, { "epoch": 0.14819779565087876, "grad_norm": 1.8331801891326904, "learning_rate": 7.402442657134347e-06, "loss": 8.654, "num_input_tokens_seen": 576480, "step": 995 }, { "epoch": 0.14894250819183796, "grad_norm": 2.2954792976379395, "learning_rate": 7.439678284182306e-06, "loss": 8.5483, "num_input_tokens_seen": 579424, "step": 1000 }, { "epoch": 0.14968722073279714, "grad_norm": 1.8098444938659668, "learning_rate": 7.476913911230266e-06, "loss": 8.2913, "num_input_tokens_seen": 582368, "step": 1005 }, { "epoch": 0.15043193327375634, "grad_norm": 2.0405702590942383, "learning_rate": 7.5141495382782254e-06, "loss": 8.4606, "num_input_tokens_seen": 585408, "step": 1010 }, { "epoch": 0.15117664581471552, "grad_norm": 2.0061416625976562, "learning_rate": 7.5513851653261844e-06, "loss": 8.4838, "num_input_tokens_seen": 588288, "step": 1015 }, { "epoch": 0.15192135835567472, "grad_norm": 2.8245115280151367, "learning_rate": 7.588620792374144e-06, "loss": 8.8155, "num_input_tokens_seen": 591200, "step": 1020 }, { "epoch": 0.1526660708966339, "grad_norm": 2.026503324508667, "learning_rate": 7.625856419422103e-06, "loss": 8.5152, "num_input_tokens_seen": 594048, "step": 1025 }, { "epoch": 0.1534107834375931, "grad_norm": 2.0989878177642822, "learning_rate": 7.663092046470063e-06, "loss": 8.3329, "num_input_tokens_seen": 597024, "step": 1030 }, { "epoch": 0.15415549597855227, "grad_norm": 2.4671192169189453, "learning_rate": 7.700327673518021e-06, "loss": 8.4159, "num_input_tokens_seen": 599840, "step": 1035 }, { "epoch": 0.15490020851951147, "grad_norm": 2.839299440383911, "learning_rate": 7.737563300565983e-06, "loss": 8.5415, "num_input_tokens_seen": 602656, "step": 1040 }, { "epoch": 0.15564492106047065, "grad_norm": 2.409449338912964, "learning_rate": 7.774798927613941e-06, "loss": 8.3726, "num_input_tokens_seen": 605760, "step": 1045 }, { "epoch": 0.15638963360142985, "grad_norm": 2.3488357067108154, "learning_rate": 7.812034554661901e-06, "loss": 8.4111, "num_input_tokens_seen": 608448, "step": 1050 }, { "epoch": 0.15713434614238903, "grad_norm": 1.9479529857635498, "learning_rate": 7.84927018170986e-06, "loss": 8.2155, "num_input_tokens_seen": 611296, "step": 1055 }, { "epoch": 0.15787905868334823, "grad_norm": 2.379605293273926, "learning_rate": 7.88650580875782e-06, "loss": 7.9793, "num_input_tokens_seen": 614272, "step": 1060 }, { "epoch": 0.15862377122430743, "grad_norm": 1.8622087240219116, "learning_rate": 7.923741435805779e-06, "loss": 7.9074, "num_input_tokens_seen": 617504, "step": 1065 }, { "epoch": 0.1593684837652666, "grad_norm": 2.0914347171783447, "learning_rate": 7.960977062853739e-06, "loss": 8.1156, "num_input_tokens_seen": 620384, "step": 1070 }, { "epoch": 0.1601131963062258, "grad_norm": 1.9552392959594727, "learning_rate": 7.998212689901698e-06, "loss": 8.1081, "num_input_tokens_seen": 623360, "step": 1075 }, { "epoch": 0.16085790884718498, "grad_norm": 1.9723215103149414, "learning_rate": 8.035448316949658e-06, "loss": 8.4374, "num_input_tokens_seen": 626336, "step": 1080 }, { "epoch": 0.16160262138814419, "grad_norm": 1.9264206886291504, "learning_rate": 8.072683943997618e-06, "loss": 7.9917, "num_input_tokens_seen": 629472, "step": 1085 }, { "epoch": 0.16234733392910336, "grad_norm": 2.1905105113983154, "learning_rate": 8.109919571045576e-06, "loss": 8.0082, "num_input_tokens_seen": 632608, "step": 1090 }, { "epoch": 0.16309204647006256, "grad_norm": 2.186253547668457, "learning_rate": 8.147155198093536e-06, "loss": 8.0982, "num_input_tokens_seen": 635680, "step": 1095 }, { "epoch": 0.16383675901102174, "grad_norm": 1.9650533199310303, "learning_rate": 8.184390825141496e-06, "loss": 7.929, "num_input_tokens_seen": 638304, "step": 1100 }, { "epoch": 0.16458147155198094, "grad_norm": 2.390148639678955, "learning_rate": 8.221626452189456e-06, "loss": 8.0881, "num_input_tokens_seen": 641312, "step": 1105 }, { "epoch": 0.16532618409294011, "grad_norm": 2.2032392024993896, "learning_rate": 8.258862079237414e-06, "loss": 8.0091, "num_input_tokens_seen": 644160, "step": 1110 }, { "epoch": 0.16607089663389932, "grad_norm": 2.5147902965545654, "learning_rate": 8.296097706285374e-06, "loss": 7.9754, "num_input_tokens_seen": 646912, "step": 1115 }, { "epoch": 0.1668156091748585, "grad_norm": 1.886702537536621, "learning_rate": 8.333333333333334e-06, "loss": 7.6772, "num_input_tokens_seen": 649888, "step": 1120 }, { "epoch": 0.1675603217158177, "grad_norm": 1.762952208518982, "learning_rate": 8.370568960381294e-06, "loss": 7.6859, "num_input_tokens_seen": 652672, "step": 1125 }, { "epoch": 0.1683050342567769, "grad_norm": 2.6452548503875732, "learning_rate": 8.407804587429252e-06, "loss": 7.7253, "num_input_tokens_seen": 655072, "step": 1130 }, { "epoch": 0.16904974679773607, "grad_norm": 2.5326364040374756, "learning_rate": 8.445040214477212e-06, "loss": 7.9246, "num_input_tokens_seen": 658112, "step": 1135 }, { "epoch": 0.16979445933869527, "grad_norm": 1.8746446371078491, "learning_rate": 8.482275841525171e-06, "loss": 8.2764, "num_input_tokens_seen": 661024, "step": 1140 }, { "epoch": 0.17053917187965445, "grad_norm": 2.0305233001708984, "learning_rate": 8.519511468573131e-06, "loss": 7.8119, "num_input_tokens_seen": 663936, "step": 1145 }, { "epoch": 0.17128388442061365, "grad_norm": 1.8093509674072266, "learning_rate": 8.556747095621091e-06, "loss": 7.8591, "num_input_tokens_seen": 666784, "step": 1150 }, { "epoch": 0.17202859696157283, "grad_norm": 1.6541447639465332, "learning_rate": 8.59398272266905e-06, "loss": 7.7003, "num_input_tokens_seen": 669984, "step": 1155 }, { "epoch": 0.17277330950253203, "grad_norm": 1.804046869277954, "learning_rate": 8.631218349717011e-06, "loss": 7.395, "num_input_tokens_seen": 672672, "step": 1160 }, { "epoch": 0.1735180220434912, "grad_norm": 1.832078456878662, "learning_rate": 8.668453976764969e-06, "loss": 7.6697, "num_input_tokens_seen": 675584, "step": 1165 }, { "epoch": 0.1742627345844504, "grad_norm": 2.200876474380493, "learning_rate": 8.705689603812929e-06, "loss": 7.5648, "num_input_tokens_seen": 678336, "step": 1170 }, { "epoch": 0.17500744712540958, "grad_norm": 2.135098695755005, "learning_rate": 8.742925230860887e-06, "loss": 7.7573, "num_input_tokens_seen": 681120, "step": 1175 }, { "epoch": 0.17575215966636878, "grad_norm": 2.0691916942596436, "learning_rate": 8.780160857908849e-06, "loss": 7.5043, "num_input_tokens_seen": 683936, "step": 1180 }, { "epoch": 0.17649687220732796, "grad_norm": 1.8354893922805786, "learning_rate": 8.817396484956807e-06, "loss": 7.5811, "num_input_tokens_seen": 686816, "step": 1185 }, { "epoch": 0.17724158474828716, "grad_norm": 1.8321928977966309, "learning_rate": 8.854632112004767e-06, "loss": 7.5391, "num_input_tokens_seen": 689664, "step": 1190 }, { "epoch": 0.17798629728924636, "grad_norm": 1.8286962509155273, "learning_rate": 8.891867739052725e-06, "loss": 7.2508, "num_input_tokens_seen": 692320, "step": 1195 }, { "epoch": 0.17873100983020554, "grad_norm": 1.6511356830596924, "learning_rate": 8.929103366100686e-06, "loss": 7.4119, "num_input_tokens_seen": 695136, "step": 1200 }, { "epoch": 0.17947572237116474, "grad_norm": 2.2797484397888184, "learning_rate": 8.966338993148644e-06, "loss": 7.4659, "num_input_tokens_seen": 698048, "step": 1205 }, { "epoch": 0.18022043491212392, "grad_norm": 2.159371852874756, "learning_rate": 9.003574620196604e-06, "loss": 7.2155, "num_input_tokens_seen": 700864, "step": 1210 }, { "epoch": 0.18096514745308312, "grad_norm": 2.1275620460510254, "learning_rate": 9.040810247244564e-06, "loss": 7.2528, "num_input_tokens_seen": 703872, "step": 1215 }, { "epoch": 0.1817098599940423, "grad_norm": 1.9150006771087646, "learning_rate": 9.078045874292524e-06, "loss": 7.5387, "num_input_tokens_seen": 706496, "step": 1220 }, { "epoch": 0.1824545725350015, "grad_norm": 2.073683023452759, "learning_rate": 9.115281501340484e-06, "loss": 7.222, "num_input_tokens_seen": 709440, "step": 1225 }, { "epoch": 0.18319928507596067, "grad_norm": 1.7260866165161133, "learning_rate": 9.152517128388442e-06, "loss": 7.2177, "num_input_tokens_seen": 712128, "step": 1230 }, { "epoch": 0.18394399761691987, "grad_norm": 2.14578914642334, "learning_rate": 9.189752755436402e-06, "loss": 7.2488, "num_input_tokens_seen": 714912, "step": 1235 }, { "epoch": 0.18468871015787905, "grad_norm": 2.189309597015381, "learning_rate": 9.226988382484362e-06, "loss": 7.2164, "num_input_tokens_seen": 717568, "step": 1240 }, { "epoch": 0.18543342269883825, "grad_norm": 2.1209731101989746, "learning_rate": 9.264224009532322e-06, "loss": 7.1643, "num_input_tokens_seen": 720704, "step": 1245 }, { "epoch": 0.18617813523979743, "grad_norm": 1.9093120098114014, "learning_rate": 9.30145963658028e-06, "loss": 7.1584, "num_input_tokens_seen": 723680, "step": 1250 }, { "epoch": 0.18692284778075663, "grad_norm": 1.82891845703125, "learning_rate": 9.33869526362824e-06, "loss": 7.3261, "num_input_tokens_seen": 726592, "step": 1255 }, { "epoch": 0.1876675603217158, "grad_norm": 2.3152081966400146, "learning_rate": 9.3759308906762e-06, "loss": 7.2368, "num_input_tokens_seen": 729536, "step": 1260 }, { "epoch": 0.188412272862675, "grad_norm": 1.9074846506118774, "learning_rate": 9.41316651772416e-06, "loss": 7.2149, "num_input_tokens_seen": 732544, "step": 1265 }, { "epoch": 0.1891569854036342, "grad_norm": 1.987952470779419, "learning_rate": 9.450402144772117e-06, "loss": 7.2286, "num_input_tokens_seen": 735712, "step": 1270 }, { "epoch": 0.18990169794459338, "grad_norm": 2.0022411346435547, "learning_rate": 9.487637771820077e-06, "loss": 7.1769, "num_input_tokens_seen": 738304, "step": 1275 }, { "epoch": 0.1906464104855526, "grad_norm": 1.9108846187591553, "learning_rate": 9.524873398868039e-06, "loss": 7.1065, "num_input_tokens_seen": 741248, "step": 1280 }, { "epoch": 0.19139112302651176, "grad_norm": 2.046776294708252, "learning_rate": 9.562109025915997e-06, "loss": 7.043, "num_input_tokens_seen": 744192, "step": 1285 }, { "epoch": 0.19213583556747096, "grad_norm": 1.981946587562561, "learning_rate": 9.599344652963957e-06, "loss": 6.8734, "num_input_tokens_seen": 747104, "step": 1290 }, { "epoch": 0.19288054810843014, "grad_norm": 2.4360899925231934, "learning_rate": 9.636580280011915e-06, "loss": 6.9457, "num_input_tokens_seen": 750048, "step": 1295 }, { "epoch": 0.19362526064938934, "grad_norm": 2.179546594619751, "learning_rate": 9.673815907059877e-06, "loss": 6.9482, "num_input_tokens_seen": 752992, "step": 1300 }, { "epoch": 0.19436997319034852, "grad_norm": 1.7435579299926758, "learning_rate": 9.711051534107835e-06, "loss": 6.6958, "num_input_tokens_seen": 755904, "step": 1305 }, { "epoch": 0.19511468573130772, "grad_norm": 1.8465232849121094, "learning_rate": 9.748287161155795e-06, "loss": 6.956, "num_input_tokens_seen": 758752, "step": 1310 }, { "epoch": 0.1958593982722669, "grad_norm": 1.6556588411331177, "learning_rate": 9.785522788203753e-06, "loss": 6.8085, "num_input_tokens_seen": 761344, "step": 1315 }, { "epoch": 0.1966041108132261, "grad_norm": 1.8582195043563843, "learning_rate": 9.822758415251714e-06, "loss": 6.9532, "num_input_tokens_seen": 764160, "step": 1320 }, { "epoch": 0.19734882335418527, "grad_norm": 1.9507546424865723, "learning_rate": 9.859994042299672e-06, "loss": 6.5696, "num_input_tokens_seen": 767104, "step": 1325 }, { "epoch": 0.19809353589514447, "grad_norm": 2.313680410385132, "learning_rate": 9.897229669347632e-06, "loss": 6.462, "num_input_tokens_seen": 769824, "step": 1330 }, { "epoch": 0.19883824843610368, "grad_norm": 1.9846305847167969, "learning_rate": 9.93446529639559e-06, "loss": 6.5817, "num_input_tokens_seen": 772544, "step": 1335 }, { "epoch": 0.19958296097706285, "grad_norm": 1.8198497295379639, "learning_rate": 9.971700923443552e-06, "loss": 6.874, "num_input_tokens_seen": 775360, "step": 1340 }, { "epoch": 0.20032767351802205, "grad_norm": 2.2677013874053955, "learning_rate": 1.0008936550491512e-05, "loss": 6.7567, "num_input_tokens_seen": 778144, "step": 1345 }, { "epoch": 0.20107238605898123, "grad_norm": 2.083909273147583, "learning_rate": 1.004617217753947e-05, "loss": 6.5919, "num_input_tokens_seen": 780864, "step": 1350 }, { "epoch": 0.20181709859994043, "grad_norm": 2.6609630584716797, "learning_rate": 1.008340780458743e-05, "loss": 6.6041, "num_input_tokens_seen": 783776, "step": 1355 }, { "epoch": 0.2025618111408996, "grad_norm": 2.807002067565918, "learning_rate": 1.012064343163539e-05, "loss": 6.7817, "num_input_tokens_seen": 786752, "step": 1360 }, { "epoch": 0.2033065236818588, "grad_norm": 1.9496326446533203, "learning_rate": 1.015787905868335e-05, "loss": 6.4856, "num_input_tokens_seen": 789536, "step": 1365 }, { "epoch": 0.20405123622281798, "grad_norm": 1.8807252645492554, "learning_rate": 1.0195114685731308e-05, "loss": 6.0728, "num_input_tokens_seen": 792160, "step": 1370 }, { "epoch": 0.20479594876377719, "grad_norm": 1.6999800205230713, "learning_rate": 1.0232350312779268e-05, "loss": 6.3461, "num_input_tokens_seen": 794912, "step": 1375 }, { "epoch": 0.20554066130473636, "grad_norm": 3.4120748043060303, "learning_rate": 1.0269585939827227e-05, "loss": 6.6033, "num_input_tokens_seen": 797824, "step": 1380 }, { "epoch": 0.20628537384569556, "grad_norm": 1.9459702968597412, "learning_rate": 1.0306821566875187e-05, "loss": 6.5561, "num_input_tokens_seen": 801184, "step": 1385 }, { "epoch": 0.20703008638665474, "grad_norm": 1.6182076930999756, "learning_rate": 1.0344057193923145e-05, "loss": 6.4149, "num_input_tokens_seen": 804032, "step": 1390 }, { "epoch": 0.20777479892761394, "grad_norm": 2.346850872039795, "learning_rate": 1.0381292820971105e-05, "loss": 6.4098, "num_input_tokens_seen": 806912, "step": 1395 }, { "epoch": 0.20851951146857314, "grad_norm": 2.1594152450561523, "learning_rate": 1.0418528448019065e-05, "loss": 6.1574, "num_input_tokens_seen": 809472, "step": 1400 }, { "epoch": 0.20926422400953232, "grad_norm": 2.1090168952941895, "learning_rate": 1.0455764075067025e-05, "loss": 6.3134, "num_input_tokens_seen": 812064, "step": 1405 }, { "epoch": 0.21000893655049152, "grad_norm": 2.024420976638794, "learning_rate": 1.0492999702114985e-05, "loss": 6.5884, "num_input_tokens_seen": 815232, "step": 1410 }, { "epoch": 0.2107536490914507, "grad_norm": 1.755567193031311, "learning_rate": 1.0530235329162943e-05, "loss": 6.2851, "num_input_tokens_seen": 818080, "step": 1415 }, { "epoch": 0.2114983616324099, "grad_norm": 1.9582843780517578, "learning_rate": 1.0567470956210903e-05, "loss": 6.4092, "num_input_tokens_seen": 820832, "step": 1420 }, { "epoch": 0.21224307417336907, "grad_norm": 2.4973018169403076, "learning_rate": 1.0604706583258863e-05, "loss": 6.2492, "num_input_tokens_seen": 823584, "step": 1425 }, { "epoch": 0.21298778671432828, "grad_norm": 2.0227766036987305, "learning_rate": 1.0641942210306823e-05, "loss": 6.0892, "num_input_tokens_seen": 826240, "step": 1430 }, { "epoch": 0.21373249925528745, "grad_norm": 1.9202308654785156, "learning_rate": 1.067917783735478e-05, "loss": 6.1673, "num_input_tokens_seen": 828928, "step": 1435 }, { "epoch": 0.21447721179624665, "grad_norm": 2.1308364868164062, "learning_rate": 1.071641346440274e-05, "loss": 6.2108, "num_input_tokens_seen": 832128, "step": 1440 }, { "epoch": 0.21522192433720583, "grad_norm": 1.8494932651519775, "learning_rate": 1.07536490914507e-05, "loss": 6.1795, "num_input_tokens_seen": 835616, "step": 1445 }, { "epoch": 0.21596663687816503, "grad_norm": 2.0794527530670166, "learning_rate": 1.079088471849866e-05, "loss": 6.1956, "num_input_tokens_seen": 838528, "step": 1450 }, { "epoch": 0.2167113494191242, "grad_norm": 2.3897407054901123, "learning_rate": 1.0828120345546618e-05, "loss": 5.945, "num_input_tokens_seen": 841440, "step": 1455 }, { "epoch": 0.2174560619600834, "grad_norm": 2.4614768028259277, "learning_rate": 1.086535597259458e-05, "loss": 6.1106, "num_input_tokens_seen": 844416, "step": 1460 }, { "epoch": 0.2182007745010426, "grad_norm": 1.6991252899169922, "learning_rate": 1.0902591599642538e-05, "loss": 5.8118, "num_input_tokens_seen": 847168, "step": 1465 }, { "epoch": 0.21894548704200179, "grad_norm": 2.2015812397003174, "learning_rate": 1.0939827226690498e-05, "loss": 5.921, "num_input_tokens_seen": 850176, "step": 1470 }, { "epoch": 0.219690199582961, "grad_norm": 2.436795234680176, "learning_rate": 1.0977062853738458e-05, "loss": 5.8002, "num_input_tokens_seen": 852992, "step": 1475 }, { "epoch": 0.22043491212392016, "grad_norm": 1.9779490232467651, "learning_rate": 1.1014298480786418e-05, "loss": 6.0247, "num_input_tokens_seen": 855968, "step": 1480 }, { "epoch": 0.22117962466487937, "grad_norm": 2.3187365531921387, "learning_rate": 1.1051534107834378e-05, "loss": 5.7129, "num_input_tokens_seen": 859136, "step": 1485 }, { "epoch": 0.22192433720583854, "grad_norm": 2.333591938018799, "learning_rate": 1.1088769734882336e-05, "loss": 5.6503, "num_input_tokens_seen": 862144, "step": 1490 }, { "epoch": 0.22266904974679774, "grad_norm": 2.465492010116577, "learning_rate": 1.1126005361930296e-05, "loss": 5.848, "num_input_tokens_seen": 865440, "step": 1495 }, { "epoch": 0.22341376228775692, "grad_norm": 2.144040107727051, "learning_rate": 1.1163240988978255e-05, "loss": 5.7051, "num_input_tokens_seen": 868640, "step": 1500 }, { "epoch": 0.22415847482871612, "grad_norm": 2.1178700923919678, "learning_rate": 1.1200476616026215e-05, "loss": 5.982, "num_input_tokens_seen": 871520, "step": 1505 }, { "epoch": 0.2249031873696753, "grad_norm": 2.300807476043701, "learning_rate": 1.1237712243074173e-05, "loss": 5.9687, "num_input_tokens_seen": 874400, "step": 1510 }, { "epoch": 0.2256478999106345, "grad_norm": 2.152510166168213, "learning_rate": 1.1274947870122133e-05, "loss": 5.4874, "num_input_tokens_seen": 877600, "step": 1515 }, { "epoch": 0.22639261245159367, "grad_norm": 1.856832504272461, "learning_rate": 1.1312183497170093e-05, "loss": 5.3149, "num_input_tokens_seen": 880544, "step": 1520 }, { "epoch": 0.22713732499255287, "grad_norm": 2.176612138748169, "learning_rate": 1.1349419124218053e-05, "loss": 5.4095, "num_input_tokens_seen": 883488, "step": 1525 }, { "epoch": 0.22788203753351208, "grad_norm": 1.8558062314987183, "learning_rate": 1.1386654751266011e-05, "loss": 5.2562, "num_input_tokens_seen": 886208, "step": 1530 }, { "epoch": 0.22862675007447125, "grad_norm": 2.738245725631714, "learning_rate": 1.1423890378313971e-05, "loss": 5.3559, "num_input_tokens_seen": 889088, "step": 1535 }, { "epoch": 0.22937146261543045, "grad_norm": 2.081664800643921, "learning_rate": 1.1461126005361931e-05, "loss": 4.9598, "num_input_tokens_seen": 891776, "step": 1540 }, { "epoch": 0.23011617515638963, "grad_norm": 2.006808042526245, "learning_rate": 1.149836163240989e-05, "loss": 5.6129, "num_input_tokens_seen": 894944, "step": 1545 }, { "epoch": 0.23086088769734883, "grad_norm": 2.1512532234191895, "learning_rate": 1.153559725945785e-05, "loss": 5.1798, "num_input_tokens_seen": 897824, "step": 1550 }, { "epoch": 0.231605600238308, "grad_norm": 2.146618604660034, "learning_rate": 1.1572832886505809e-05, "loss": 4.9225, "num_input_tokens_seen": 900640, "step": 1555 }, { "epoch": 0.2323503127792672, "grad_norm": 1.9377002716064453, "learning_rate": 1.1610068513553769e-05, "loss": 5.1011, "num_input_tokens_seen": 903424, "step": 1560 }, { "epoch": 0.23309502532022638, "grad_norm": 2.1119284629821777, "learning_rate": 1.1647304140601728e-05, "loss": 4.8027, "num_input_tokens_seen": 906048, "step": 1565 }, { "epoch": 0.2338397378611856, "grad_norm": 2.483553886413574, "learning_rate": 1.1684539767649688e-05, "loss": 5.2488, "num_input_tokens_seen": 908896, "step": 1570 }, { "epoch": 0.23458445040214476, "grad_norm": 2.145702600479126, "learning_rate": 1.1721775394697646e-05, "loss": 5.0716, "num_input_tokens_seen": 911776, "step": 1575 }, { "epoch": 0.23532916294310396, "grad_norm": 2.3045175075531006, "learning_rate": 1.1759011021745606e-05, "loss": 5.1548, "num_input_tokens_seen": 915040, "step": 1580 }, { "epoch": 0.23607387548406314, "grad_norm": 2.1255102157592773, "learning_rate": 1.1796246648793566e-05, "loss": 5.2823, "num_input_tokens_seen": 917952, "step": 1585 }, { "epoch": 0.23681858802502234, "grad_norm": 2.1454765796661377, "learning_rate": 1.1833482275841526e-05, "loss": 4.9906, "num_input_tokens_seen": 920640, "step": 1590 }, { "epoch": 0.23756330056598154, "grad_norm": 2.1216299533843994, "learning_rate": 1.1870717902889484e-05, "loss": 5.0665, "num_input_tokens_seen": 923680, "step": 1595 }, { "epoch": 0.23830801310694072, "grad_norm": 2.174565076828003, "learning_rate": 1.1907953529937444e-05, "loss": 4.9688, "num_input_tokens_seen": 926560, "step": 1600 }, { "epoch": 0.23905272564789992, "grad_norm": 2.273591995239258, "learning_rate": 1.1945189156985404e-05, "loss": 4.8797, "num_input_tokens_seen": 929376, "step": 1605 }, { "epoch": 0.2397974381888591, "grad_norm": 2.1234607696533203, "learning_rate": 1.1982424784033364e-05, "loss": 4.8498, "num_input_tokens_seen": 932160, "step": 1610 }, { "epoch": 0.2405421507298183, "grad_norm": 1.711399793624878, "learning_rate": 1.2019660411081324e-05, "loss": 4.5326, "num_input_tokens_seen": 935040, "step": 1615 }, { "epoch": 0.24128686327077747, "grad_norm": 1.9587254524230957, "learning_rate": 1.2056896038129282e-05, "loss": 4.4323, "num_input_tokens_seen": 937824, "step": 1620 }, { "epoch": 0.24203157581173668, "grad_norm": 2.373835325241089, "learning_rate": 1.2094131665177243e-05, "loss": 4.8082, "num_input_tokens_seen": 940832, "step": 1625 }, { "epoch": 0.24277628835269585, "grad_norm": 2.1774230003356934, "learning_rate": 1.2131367292225201e-05, "loss": 4.8094, "num_input_tokens_seen": 943520, "step": 1630 }, { "epoch": 0.24352100089365505, "grad_norm": 2.114117383956909, "learning_rate": 1.2168602919273161e-05, "loss": 4.7407, "num_input_tokens_seen": 946400, "step": 1635 }, { "epoch": 0.24426571343461423, "grad_norm": 2.2343578338623047, "learning_rate": 1.220583854632112e-05, "loss": 4.6662, "num_input_tokens_seen": 949504, "step": 1640 }, { "epoch": 0.24501042597557343, "grad_norm": 2.118962526321411, "learning_rate": 1.2243074173369081e-05, "loss": 4.7041, "num_input_tokens_seen": 953184, "step": 1645 }, { "epoch": 0.2457551385165326, "grad_norm": 2.3218774795532227, "learning_rate": 1.228030980041704e-05, "loss": 4.3236, "num_input_tokens_seen": 956256, "step": 1650 }, { "epoch": 0.2464998510574918, "grad_norm": 2.104234218597412, "learning_rate": 1.2317545427464999e-05, "loss": 4.4012, "num_input_tokens_seen": 959168, "step": 1655 }, { "epoch": 0.247244563598451, "grad_norm": 2.1568896770477295, "learning_rate": 1.2354781054512959e-05, "loss": 4.4878, "num_input_tokens_seen": 962016, "step": 1660 }, { "epoch": 0.2479892761394102, "grad_norm": 2.1595075130462646, "learning_rate": 1.2392016681560919e-05, "loss": 4.2619, "num_input_tokens_seen": 964896, "step": 1665 }, { "epoch": 0.2487339886803694, "grad_norm": 2.3548309803009033, "learning_rate": 1.2429252308608877e-05, "loss": 4.1734, "num_input_tokens_seen": 967712, "step": 1670 }, { "epoch": 0.24947870122132856, "grad_norm": 2.11722993850708, "learning_rate": 1.2466487935656837e-05, "loss": 4.2216, "num_input_tokens_seen": 970752, "step": 1675 }, { "epoch": 0.25022341376228774, "grad_norm": 1.8261165618896484, "learning_rate": 1.2503723562704797e-05, "loss": 4.4554, "num_input_tokens_seen": 974080, "step": 1680 }, { "epoch": 0.25096812630324694, "grad_norm": 2.310699701309204, "learning_rate": 1.2540959189752758e-05, "loss": 4.008, "num_input_tokens_seen": 976960, "step": 1685 }, { "epoch": 0.25171283884420614, "grad_norm": 2.5496437549591064, "learning_rate": 1.2578194816800715e-05, "loss": 4.4262, "num_input_tokens_seen": 980192, "step": 1690 }, { "epoch": 0.25245755138516535, "grad_norm": 1.8178335428237915, "learning_rate": 1.2615430443848674e-05, "loss": 3.9317, "num_input_tokens_seen": 982944, "step": 1695 }, { "epoch": 0.2532022639261245, "grad_norm": 1.82365083694458, "learning_rate": 1.2652666070896634e-05, "loss": 3.7271, "num_input_tokens_seen": 985696, "step": 1700 }, { "epoch": 0.2539469764670837, "grad_norm": 2.2795302867889404, "learning_rate": 1.2689901697944596e-05, "loss": 3.7452, "num_input_tokens_seen": 988480, "step": 1705 }, { "epoch": 0.2546916890080429, "grad_norm": 2.0120627880096436, "learning_rate": 1.2727137324992552e-05, "loss": 3.7087, "num_input_tokens_seen": 991392, "step": 1710 }, { "epoch": 0.2554364015490021, "grad_norm": 2.113953113555908, "learning_rate": 1.2764372952040512e-05, "loss": 3.9533, "num_input_tokens_seen": 994048, "step": 1715 }, { "epoch": 0.25618111408996125, "grad_norm": 2.1110692024230957, "learning_rate": 1.2801608579088472e-05, "loss": 3.7556, "num_input_tokens_seen": 996576, "step": 1720 }, { "epoch": 0.25692582663092045, "grad_norm": 2.3094258308410645, "learning_rate": 1.2838844206136434e-05, "loss": 3.7612, "num_input_tokens_seen": 999616, "step": 1725 }, { "epoch": 0.25767053917187965, "grad_norm": 2.2187602519989014, "learning_rate": 1.287607983318439e-05, "loss": 3.7568, "num_input_tokens_seen": 1002720, "step": 1730 }, { "epoch": 0.25841525171283886, "grad_norm": 2.2565386295318604, "learning_rate": 1.291331546023235e-05, "loss": 3.9223, "num_input_tokens_seen": 1005440, "step": 1735 }, { "epoch": 0.25915996425379806, "grad_norm": 2.1787664890289307, "learning_rate": 1.295055108728031e-05, "loss": 3.2723, "num_input_tokens_seen": 1007936, "step": 1740 }, { "epoch": 0.2599046767947572, "grad_norm": 2.0584473609924316, "learning_rate": 1.2987786714328271e-05, "loss": 3.4408, "num_input_tokens_seen": 1010848, "step": 1745 }, { "epoch": 0.2606493893357164, "grad_norm": 2.2564525604248047, "learning_rate": 1.3025022341376231e-05, "loss": 3.483, "num_input_tokens_seen": 1013664, "step": 1750 }, { "epoch": 0.2613941018766756, "grad_norm": 1.8954851627349854, "learning_rate": 1.3062257968424188e-05, "loss": 3.8377, "num_input_tokens_seen": 1016800, "step": 1755 }, { "epoch": 0.2621388144176348, "grad_norm": 1.772100806236267, "learning_rate": 1.3099493595472147e-05, "loss": 3.3771, "num_input_tokens_seen": 1019648, "step": 1760 }, { "epoch": 0.26288352695859396, "grad_norm": 2.303882122039795, "learning_rate": 1.3136729222520109e-05, "loss": 3.4546, "num_input_tokens_seen": 1022432, "step": 1765 }, { "epoch": 0.26362823949955316, "grad_norm": 2.6440579891204834, "learning_rate": 1.3173964849568069e-05, "loss": 3.4652, "num_input_tokens_seen": 1025472, "step": 1770 }, { "epoch": 0.26437295204051237, "grad_norm": 2.455634117126465, "learning_rate": 1.3211200476616025e-05, "loss": 3.5171, "num_input_tokens_seen": 1028288, "step": 1775 }, { "epoch": 0.26511766458147157, "grad_norm": 1.8766182661056519, "learning_rate": 1.3248436103663985e-05, "loss": 3.2886, "num_input_tokens_seen": 1031040, "step": 1780 }, { "epoch": 0.2658623771224307, "grad_norm": 2.0706112384796143, "learning_rate": 1.3285671730711947e-05, "loss": 3.0507, "num_input_tokens_seen": 1033792, "step": 1785 }, { "epoch": 0.2666070896633899, "grad_norm": 2.1233110427856445, "learning_rate": 1.3322907357759907e-05, "loss": 2.9836, "num_input_tokens_seen": 1036448, "step": 1790 }, { "epoch": 0.2673518022043491, "grad_norm": 2.1404995918273926, "learning_rate": 1.3360142984807863e-05, "loss": 3.0825, "num_input_tokens_seen": 1039168, "step": 1795 }, { "epoch": 0.2680965147453083, "grad_norm": 2.0489389896392822, "learning_rate": 1.3397378611855823e-05, "loss": 3.0927, "num_input_tokens_seen": 1041952, "step": 1800 }, { "epoch": 0.2688412272862675, "grad_norm": 2.2032887935638428, "learning_rate": 1.3434614238903784e-05, "loss": 3.0627, "num_input_tokens_seen": 1044608, "step": 1805 }, { "epoch": 0.2695859398272267, "grad_norm": 1.824331283569336, "learning_rate": 1.3471849865951744e-05, "loss": 2.9069, "num_input_tokens_seen": 1047552, "step": 1810 }, { "epoch": 0.2703306523681859, "grad_norm": 2.4308481216430664, "learning_rate": 1.3509085492999704e-05, "loss": 3.1612, "num_input_tokens_seen": 1051200, "step": 1815 }, { "epoch": 0.2710753649091451, "grad_norm": 2.1079201698303223, "learning_rate": 1.354632112004766e-05, "loss": 2.779, "num_input_tokens_seen": 1053888, "step": 1820 }, { "epoch": 0.2718200774501043, "grad_norm": 2.4178786277770996, "learning_rate": 1.3583556747095622e-05, "loss": 2.521, "num_input_tokens_seen": 1056544, "step": 1825 }, { "epoch": 0.2725647899910634, "grad_norm": 2.1237339973449707, "learning_rate": 1.3620792374143582e-05, "loss": 2.9227, "num_input_tokens_seen": 1059296, "step": 1830 }, { "epoch": 0.27330950253202263, "grad_norm": 2.205962657928467, "learning_rate": 1.3658028001191542e-05, "loss": 2.9058, "num_input_tokens_seen": 1062176, "step": 1835 }, { "epoch": 0.27405421507298183, "grad_norm": 2.187598466873169, "learning_rate": 1.3695263628239498e-05, "loss": 2.9644, "num_input_tokens_seen": 1064992, "step": 1840 }, { "epoch": 0.27479892761394104, "grad_norm": 2.3399295806884766, "learning_rate": 1.373249925528746e-05, "loss": 2.671, "num_input_tokens_seen": 1067872, "step": 1845 }, { "epoch": 0.2755436401549002, "grad_norm": 2.234445810317993, "learning_rate": 1.376973488233542e-05, "loss": 2.9598, "num_input_tokens_seen": 1070816, "step": 1850 }, { "epoch": 0.2762883526958594, "grad_norm": 1.9826840162277222, "learning_rate": 1.380697050938338e-05, "loss": 2.9565, "num_input_tokens_seen": 1073600, "step": 1855 }, { "epoch": 0.2770330652368186, "grad_norm": 2.0602126121520996, "learning_rate": 1.3844206136431338e-05, "loss": 2.8917, "num_input_tokens_seen": 1076512, "step": 1860 }, { "epoch": 0.2777777777777778, "grad_norm": 2.057310104370117, "learning_rate": 1.3881441763479298e-05, "loss": 2.4602, "num_input_tokens_seen": 1079264, "step": 1865 }, { "epoch": 0.278522490318737, "grad_norm": 1.950012445449829, "learning_rate": 1.3918677390527257e-05, "loss": 2.4473, "num_input_tokens_seen": 1082144, "step": 1870 }, { "epoch": 0.27926720285969614, "grad_norm": 2.0533056259155273, "learning_rate": 1.3955913017575217e-05, "loss": 2.3215, "num_input_tokens_seen": 1084960, "step": 1875 }, { "epoch": 0.28001191540065534, "grad_norm": 2.3437061309814453, "learning_rate": 1.3993148644623177e-05, "loss": 2.6754, "num_input_tokens_seen": 1087936, "step": 1880 }, { "epoch": 0.28075662794161454, "grad_norm": 1.8162267208099365, "learning_rate": 1.4030384271671135e-05, "loss": 2.3945, "num_input_tokens_seen": 1091008, "step": 1885 }, { "epoch": 0.28150134048257375, "grad_norm": 2.3793885707855225, "learning_rate": 1.4067619898719095e-05, "loss": 2.3544, "num_input_tokens_seen": 1093664, "step": 1890 }, { "epoch": 0.2822460530235329, "grad_norm": 2.295112133026123, "learning_rate": 1.4104855525767055e-05, "loss": 2.5135, "num_input_tokens_seen": 1096640, "step": 1895 }, { "epoch": 0.2829907655644921, "grad_norm": 1.8245350122451782, "learning_rate": 1.4142091152815015e-05, "loss": 2.4117, "num_input_tokens_seen": 1099200, "step": 1900 }, { "epoch": 0.2837354781054513, "grad_norm": 1.9524224996566772, "learning_rate": 1.4179326779862973e-05, "loss": 2.5307, "num_input_tokens_seen": 1102368, "step": 1905 }, { "epoch": 0.2844801906464105, "grad_norm": 1.7752220630645752, "learning_rate": 1.4216562406910933e-05, "loss": 2.2398, "num_input_tokens_seen": 1105056, "step": 1910 }, { "epoch": 0.28522490318736965, "grad_norm": 2.1771035194396973, "learning_rate": 1.4253798033958893e-05, "loss": 2.2138, "num_input_tokens_seen": 1108032, "step": 1915 }, { "epoch": 0.28596961572832885, "grad_norm": 2.1641135215759277, "learning_rate": 1.4291033661006853e-05, "loss": 2.1683, "num_input_tokens_seen": 1110880, "step": 1920 }, { "epoch": 0.28671432826928805, "grad_norm": 2.7316365242004395, "learning_rate": 1.432826928805481e-05, "loss": 2.2416, "num_input_tokens_seen": 1113568, "step": 1925 }, { "epoch": 0.28745904081024726, "grad_norm": 2.2778780460357666, "learning_rate": 1.436550491510277e-05, "loss": 2.2691, "num_input_tokens_seen": 1116416, "step": 1930 }, { "epoch": 0.28820375335120646, "grad_norm": 1.770742416381836, "learning_rate": 1.440274054215073e-05, "loss": 1.9841, "num_input_tokens_seen": 1119232, "step": 1935 }, { "epoch": 0.2889484658921656, "grad_norm": 2.235405683517456, "learning_rate": 1.443997616919869e-05, "loss": 2.3579, "num_input_tokens_seen": 1122144, "step": 1940 }, { "epoch": 0.2896931784331248, "grad_norm": 2.5629947185516357, "learning_rate": 1.447721179624665e-05, "loss": 2.0675, "num_input_tokens_seen": 1125056, "step": 1945 }, { "epoch": 0.290437890974084, "grad_norm": 2.0699100494384766, "learning_rate": 1.4514447423294608e-05, "loss": 2.0441, "num_input_tokens_seen": 1127744, "step": 1950 }, { "epoch": 0.2911826035150432, "grad_norm": 1.4894883632659912, "learning_rate": 1.4551683050342568e-05, "loss": 1.9212, "num_input_tokens_seen": 1130432, "step": 1955 }, { "epoch": 0.29192731605600236, "grad_norm": 1.8101307153701782, "learning_rate": 1.4588918677390528e-05, "loss": 1.829, "num_input_tokens_seen": 1133184, "step": 1960 }, { "epoch": 0.29267202859696156, "grad_norm": 2.211242437362671, "learning_rate": 1.4626154304438488e-05, "loss": 1.9884, "num_input_tokens_seen": 1136256, "step": 1965 }, { "epoch": 0.29341674113792077, "grad_norm": 2.1376945972442627, "learning_rate": 1.4663389931486446e-05, "loss": 1.863, "num_input_tokens_seen": 1139104, "step": 1970 }, { "epoch": 0.29416145367887997, "grad_norm": 1.6463494300842285, "learning_rate": 1.4700625558534406e-05, "loss": 1.8984, "num_input_tokens_seen": 1142176, "step": 1975 }, { "epoch": 0.2949061662198391, "grad_norm": 2.0253403186798096, "learning_rate": 1.4737861185582366e-05, "loss": 1.7177, "num_input_tokens_seen": 1144896, "step": 1980 }, { "epoch": 0.2956508787607983, "grad_norm": 1.7315335273742676, "learning_rate": 1.4775096812630326e-05, "loss": 2.0063, "num_input_tokens_seen": 1148192, "step": 1985 }, { "epoch": 0.2963955913017575, "grad_norm": 2.181551694869995, "learning_rate": 1.4812332439678284e-05, "loss": 1.5966, "num_input_tokens_seen": 1150848, "step": 1990 }, { "epoch": 0.2971403038427167, "grad_norm": 2.174860954284668, "learning_rate": 1.4849568066726244e-05, "loss": 2.0779, "num_input_tokens_seen": 1153792, "step": 1995 }, { "epoch": 0.2978850163836759, "grad_norm": 1.7176265716552734, "learning_rate": 1.4886803693774203e-05, "loss": 1.6202, "num_input_tokens_seen": 1156544, "step": 2000 }, { "epoch": 0.2986297289246351, "grad_norm": 1.8762925863265991, "learning_rate": 1.4924039320822163e-05, "loss": 1.5206, "num_input_tokens_seen": 1159360, "step": 2005 }, { "epoch": 0.2993744414655943, "grad_norm": 2.088738441467285, "learning_rate": 1.4961274947870125e-05, "loss": 1.6599, "num_input_tokens_seen": 1162240, "step": 2010 }, { "epoch": 0.3001191540065535, "grad_norm": 1.8903850317001343, "learning_rate": 1.4998510574918081e-05, "loss": 1.6083, "num_input_tokens_seen": 1165056, "step": 2015 }, { "epoch": 0.3008638665475127, "grad_norm": 1.5454847812652588, "learning_rate": 1.5035746201966041e-05, "loss": 1.4326, "num_input_tokens_seen": 1167616, "step": 2020 }, { "epoch": 0.30160857908847183, "grad_norm": 1.9378328323364258, "learning_rate": 1.5072981829014001e-05, "loss": 1.6031, "num_input_tokens_seen": 1170432, "step": 2025 }, { "epoch": 0.30235329162943103, "grad_norm": 1.508760929107666, "learning_rate": 1.5110217456061963e-05, "loss": 1.5404, "num_input_tokens_seen": 1173216, "step": 2030 }, { "epoch": 0.30309800417039023, "grad_norm": 1.4875895977020264, "learning_rate": 1.5147453083109919e-05, "loss": 1.3917, "num_input_tokens_seen": 1176192, "step": 2035 }, { "epoch": 0.30384271671134944, "grad_norm": 1.5786709785461426, "learning_rate": 1.5184688710157879e-05, "loss": 1.9424, "num_input_tokens_seen": 1179296, "step": 2040 }, { "epoch": 0.3045874292523086, "grad_norm": 1.5882951021194458, "learning_rate": 1.5221924337205839e-05, "loss": 1.5224, "num_input_tokens_seen": 1182240, "step": 2045 }, { "epoch": 0.3053321417932678, "grad_norm": 1.8552970886230469, "learning_rate": 1.52591599642538e-05, "loss": 1.5575, "num_input_tokens_seen": 1185088, "step": 2050 }, { "epoch": 0.306076854334227, "grad_norm": 1.1116312742233276, "learning_rate": 1.529639559130176e-05, "loss": 1.3657, "num_input_tokens_seen": 1188128, "step": 2055 }, { "epoch": 0.3068215668751862, "grad_norm": 1.7827011346817017, "learning_rate": 1.5333631218349718e-05, "loss": 1.4781, "num_input_tokens_seen": 1190944, "step": 2060 }, { "epoch": 0.3075662794161454, "grad_norm": 1.6192513704299927, "learning_rate": 1.5370866845397678e-05, "loss": 1.596, "num_input_tokens_seen": 1193920, "step": 2065 }, { "epoch": 0.30831099195710454, "grad_norm": 1.913422703742981, "learning_rate": 1.5408102472445638e-05, "loss": 1.5943, "num_input_tokens_seen": 1196640, "step": 2070 }, { "epoch": 0.30905570449806374, "grad_norm": 2.6497297286987305, "learning_rate": 1.5445338099493598e-05, "loss": 1.4702, "num_input_tokens_seen": 1199616, "step": 2075 }, { "epoch": 0.30980041703902295, "grad_norm": 1.5951499938964844, "learning_rate": 1.5482573726541554e-05, "loss": 1.4576, "num_input_tokens_seen": 1202368, "step": 2080 }, { "epoch": 0.31054512957998215, "grad_norm": 1.4976361989974976, "learning_rate": 1.5519809353589514e-05, "loss": 1.4845, "num_input_tokens_seen": 1205312, "step": 2085 }, { "epoch": 0.3112898421209413, "grad_norm": 2.073979616165161, "learning_rate": 1.5557044980637474e-05, "loss": 1.7954, "num_input_tokens_seen": 1208608, "step": 2090 }, { "epoch": 0.3120345546619005, "grad_norm": 1.8771330118179321, "learning_rate": 1.5594280607685434e-05, "loss": 1.587, "num_input_tokens_seen": 1211520, "step": 2095 }, { "epoch": 0.3127792672028597, "grad_norm": 2.1169614791870117, "learning_rate": 1.5631516234733394e-05, "loss": 1.6911, "num_input_tokens_seen": 1214592, "step": 2100 }, { "epoch": 0.3135239797438189, "grad_norm": 1.5659019947052002, "learning_rate": 1.5668751861781354e-05, "loss": 1.4357, "num_input_tokens_seen": 1217440, "step": 2105 }, { "epoch": 0.31426869228477805, "grad_norm": 1.6576358079910278, "learning_rate": 1.5705987488829313e-05, "loss": 1.3556, "num_input_tokens_seen": 1220416, "step": 2110 }, { "epoch": 0.31501340482573725, "grad_norm": 2.2820560932159424, "learning_rate": 1.5743223115877273e-05, "loss": 1.2275, "num_input_tokens_seen": 1222912, "step": 2115 }, { "epoch": 0.31575811736669646, "grad_norm": 1.9699771404266357, "learning_rate": 1.578045874292523e-05, "loss": 1.5058, "num_input_tokens_seen": 1225952, "step": 2120 }, { "epoch": 0.31650282990765566, "grad_norm": 2.0128748416900635, "learning_rate": 1.581769436997319e-05, "loss": 1.0311, "num_input_tokens_seen": 1228736, "step": 2125 }, { "epoch": 0.31724754244861486, "grad_norm": 1.6052312850952148, "learning_rate": 1.585492999702115e-05, "loss": 1.2848, "num_input_tokens_seen": 1231584, "step": 2130 }, { "epoch": 0.317992254989574, "grad_norm": 1.737395167350769, "learning_rate": 1.589216562406911e-05, "loss": 1.3959, "num_input_tokens_seen": 1234432, "step": 2135 }, { "epoch": 0.3187369675305332, "grad_norm": 1.601752758026123, "learning_rate": 1.5929401251117073e-05, "loss": 1.3878, "num_input_tokens_seen": 1237376, "step": 2140 }, { "epoch": 0.3194816800714924, "grad_norm": 2.119211196899414, "learning_rate": 1.596663687816503e-05, "loss": 1.4071, "num_input_tokens_seen": 1240352, "step": 2145 }, { "epoch": 0.3202263926124516, "grad_norm": 1.8155056238174438, "learning_rate": 1.600387250521299e-05, "loss": 1.3292, "num_input_tokens_seen": 1242944, "step": 2150 }, { "epoch": 0.32097110515341076, "grad_norm": 1.690055251121521, "learning_rate": 1.604110813226095e-05, "loss": 1.1305, "num_input_tokens_seen": 1245760, "step": 2155 }, { "epoch": 0.32171581769436997, "grad_norm": 1.7417428493499756, "learning_rate": 1.607834375930891e-05, "loss": 1.2967, "num_input_tokens_seen": 1248352, "step": 2160 }, { "epoch": 0.32246053023532917, "grad_norm": 1.6130472421646118, "learning_rate": 1.6115579386356865e-05, "loss": 1.2119, "num_input_tokens_seen": 1251072, "step": 2165 }, { "epoch": 0.32320524277628837, "grad_norm": 1.5688849687576294, "learning_rate": 1.6152815013404825e-05, "loss": 1.3056, "num_input_tokens_seen": 1253792, "step": 2170 }, { "epoch": 0.3239499553172475, "grad_norm": 1.1581082344055176, "learning_rate": 1.6190050640452785e-05, "loss": 1.3183, "num_input_tokens_seen": 1256512, "step": 2175 }, { "epoch": 0.3246946678582067, "grad_norm": 1.9028618335723877, "learning_rate": 1.6227286267500748e-05, "loss": 1.1565, "num_input_tokens_seen": 1259200, "step": 2180 }, { "epoch": 0.3254393803991659, "grad_norm": 1.5735512971878052, "learning_rate": 1.6264521894548704e-05, "loss": 1.3667, "num_input_tokens_seen": 1262208, "step": 2185 }, { "epoch": 0.3261840929401251, "grad_norm": 1.658909797668457, "learning_rate": 1.6301757521596664e-05, "loss": 1.1894, "num_input_tokens_seen": 1264960, "step": 2190 }, { "epoch": 0.32692880548108433, "grad_norm": 1.4375286102294922, "learning_rate": 1.6338993148644624e-05, "loss": 1.1483, "num_input_tokens_seen": 1268000, "step": 2195 }, { "epoch": 0.3276735180220435, "grad_norm": 1.9041633605957031, "learning_rate": 1.6376228775692584e-05, "loss": 1.4181, "num_input_tokens_seen": 1271296, "step": 2200 }, { "epoch": 0.3284182305630027, "grad_norm": 2.1105844974517822, "learning_rate": 1.6413464402740544e-05, "loss": 1.0598, "num_input_tokens_seen": 1274368, "step": 2205 }, { "epoch": 0.3291629431039619, "grad_norm": 1.4943361282348633, "learning_rate": 1.64507000297885e-05, "loss": 1.2826, "num_input_tokens_seen": 1277216, "step": 2210 }, { "epoch": 0.3299076556449211, "grad_norm": 1.4837372303009033, "learning_rate": 1.648793565683646e-05, "loss": 1.1493, "num_input_tokens_seen": 1280064, "step": 2215 }, { "epoch": 0.33065236818588023, "grad_norm": 1.9629344940185547, "learning_rate": 1.6525171283884423e-05, "loss": 1.1807, "num_input_tokens_seen": 1282880, "step": 2220 }, { "epoch": 0.33139708072683943, "grad_norm": 1.7236604690551758, "learning_rate": 1.6562406910932383e-05, "loss": 1.1562, "num_input_tokens_seen": 1285856, "step": 2225 }, { "epoch": 0.33214179326779864, "grad_norm": 1.819923996925354, "learning_rate": 1.659964253798034e-05, "loss": 1.0202, "num_input_tokens_seen": 1288576, "step": 2230 }, { "epoch": 0.33288650580875784, "grad_norm": 1.9570730924606323, "learning_rate": 1.66368781650283e-05, "loss": 1.19, "num_input_tokens_seen": 1291360, "step": 2235 }, { "epoch": 0.333631218349717, "grad_norm": 1.6427043676376343, "learning_rate": 1.667411379207626e-05, "loss": 1.132, "num_input_tokens_seen": 1294368, "step": 2240 }, { "epoch": 0.3343759308906762, "grad_norm": 1.3371925354003906, "learning_rate": 1.671134941912422e-05, "loss": 1.0933, "num_input_tokens_seen": 1297152, "step": 2245 }, { "epoch": 0.3351206434316354, "grad_norm": 1.6675834655761719, "learning_rate": 1.6748585046172176e-05, "loss": 1.3801, "num_input_tokens_seen": 1300640, "step": 2250 }, { "epoch": 0.3358653559725946, "grad_norm": 1.4136055707931519, "learning_rate": 1.6785820673220136e-05, "loss": 1.1254, "num_input_tokens_seen": 1303488, "step": 2255 }, { "epoch": 0.3366100685135538, "grad_norm": 1.8907856941223145, "learning_rate": 1.68230563002681e-05, "loss": 1.0701, "num_input_tokens_seen": 1306528, "step": 2260 }, { "epoch": 0.33735478105451294, "grad_norm": 1.5764060020446777, "learning_rate": 1.686029192731606e-05, "loss": 1.4708, "num_input_tokens_seen": 1309792, "step": 2265 }, { "epoch": 0.33809949359547214, "grad_norm": 1.8346232175827026, "learning_rate": 1.689752755436402e-05, "loss": 1.0202, "num_input_tokens_seen": 1312448, "step": 2270 }, { "epoch": 0.33884420613643135, "grad_norm": 1.5014249086380005, "learning_rate": 1.6934763181411975e-05, "loss": 1.1756, "num_input_tokens_seen": 1315136, "step": 2275 }, { "epoch": 0.33958891867739055, "grad_norm": 1.6847363710403442, "learning_rate": 1.6971998808459935e-05, "loss": 1.2035, "num_input_tokens_seen": 1318144, "step": 2280 }, { "epoch": 0.3403336312183497, "grad_norm": 1.5848220586776733, "learning_rate": 1.7009234435507895e-05, "loss": 0.9999, "num_input_tokens_seen": 1320992, "step": 2285 }, { "epoch": 0.3410783437593089, "grad_norm": 1.6695587635040283, "learning_rate": 1.7046470062555855e-05, "loss": 1.0894, "num_input_tokens_seen": 1323808, "step": 2290 }, { "epoch": 0.3418230563002681, "grad_norm": 1.6204088926315308, "learning_rate": 1.7083705689603814e-05, "loss": 0.9584, "num_input_tokens_seen": 1326400, "step": 2295 }, { "epoch": 0.3425677688412273, "grad_norm": 1.2892577648162842, "learning_rate": 1.7120941316651774e-05, "loss": 0.9892, "num_input_tokens_seen": 1329184, "step": 2300 }, { "epoch": 0.34331248138218645, "grad_norm": 1.2773076295852661, "learning_rate": 1.7158176943699734e-05, "loss": 1.0157, "num_input_tokens_seen": 1332224, "step": 2305 }, { "epoch": 0.34405719392314565, "grad_norm": 1.167803168296814, "learning_rate": 1.7195412570747694e-05, "loss": 0.9671, "num_input_tokens_seen": 1335168, "step": 2310 }, { "epoch": 0.34480190646410486, "grad_norm": 1.7468788623809814, "learning_rate": 1.723264819779565e-05, "loss": 1.0704, "num_input_tokens_seen": 1338016, "step": 2315 }, { "epoch": 0.34554661900506406, "grad_norm": 1.167558193206787, "learning_rate": 1.726988382484361e-05, "loss": 1.0827, "num_input_tokens_seen": 1340640, "step": 2320 }, { "epoch": 0.34629133154602326, "grad_norm": 2.881779193878174, "learning_rate": 1.730711945189157e-05, "loss": 1.368, "num_input_tokens_seen": 1343648, "step": 2325 }, { "epoch": 0.3470360440869824, "grad_norm": 1.3402376174926758, "learning_rate": 1.734435507893953e-05, "loss": 1.0717, "num_input_tokens_seen": 1346464, "step": 2330 }, { "epoch": 0.3477807566279416, "grad_norm": 1.8438957929611206, "learning_rate": 1.738159070598749e-05, "loss": 1.1592, "num_input_tokens_seen": 1349408, "step": 2335 }, { "epoch": 0.3485254691689008, "grad_norm": 1.5102512836456299, "learning_rate": 1.741882633303545e-05, "loss": 1.096, "num_input_tokens_seen": 1352288, "step": 2340 }, { "epoch": 0.34927018170986, "grad_norm": 2.2932651042938232, "learning_rate": 1.745606196008341e-05, "loss": 1.1133, "num_input_tokens_seen": 1354944, "step": 2345 }, { "epoch": 0.35001489425081916, "grad_norm": 1.5305975675582886, "learning_rate": 1.749329758713137e-05, "loss": 1.1098, "num_input_tokens_seen": 1358048, "step": 2350 }, { "epoch": 0.35075960679177837, "grad_norm": 1.6989938020706177, "learning_rate": 1.753053321417933e-05, "loss": 1.0075, "num_input_tokens_seen": 1360704, "step": 2355 }, { "epoch": 0.35150431933273757, "grad_norm": 2.62186598777771, "learning_rate": 1.7567768841227286e-05, "loss": 1.0832, "num_input_tokens_seen": 1363232, "step": 2360 }, { "epoch": 0.35224903187369677, "grad_norm": 1.8099082708358765, "learning_rate": 1.7605004468275246e-05, "loss": 1.2044, "num_input_tokens_seen": 1366272, "step": 2365 }, { "epoch": 0.3529937444146559, "grad_norm": 1.2809104919433594, "learning_rate": 1.7642240095323205e-05, "loss": 0.9145, "num_input_tokens_seen": 1369024, "step": 2370 }, { "epoch": 0.3537384569556151, "grad_norm": 1.754508376121521, "learning_rate": 1.7679475722371165e-05, "loss": 1.07, "num_input_tokens_seen": 1371936, "step": 2375 }, { "epoch": 0.3544831694965743, "grad_norm": 1.502811074256897, "learning_rate": 1.7716711349419125e-05, "loss": 1.2452, "num_input_tokens_seen": 1374976, "step": 2380 }, { "epoch": 0.3552278820375335, "grad_norm": 1.3855011463165283, "learning_rate": 1.7753946976467085e-05, "loss": 0.9996, "num_input_tokens_seen": 1378208, "step": 2385 }, { "epoch": 0.35597259457849273, "grad_norm": 1.6435964107513428, "learning_rate": 1.7791182603515045e-05, "loss": 1.0976, "num_input_tokens_seen": 1381056, "step": 2390 }, { "epoch": 0.3567173071194519, "grad_norm": 1.4176504611968994, "learning_rate": 1.7828418230563005e-05, "loss": 1.1706, "num_input_tokens_seen": 1384384, "step": 2395 }, { "epoch": 0.3574620196604111, "grad_norm": 1.6839343309402466, "learning_rate": 1.786565385761096e-05, "loss": 0.913, "num_input_tokens_seen": 1387200, "step": 2400 }, { "epoch": 0.3582067322013703, "grad_norm": 1.9347881078720093, "learning_rate": 1.790288948465892e-05, "loss": 1.0272, "num_input_tokens_seen": 1390272, "step": 2405 }, { "epoch": 0.3589514447423295, "grad_norm": 1.568557858467102, "learning_rate": 1.794012511170688e-05, "loss": 1.0796, "num_input_tokens_seen": 1393408, "step": 2410 }, { "epoch": 0.35969615728328863, "grad_norm": 2.1734368801116943, "learning_rate": 1.797736073875484e-05, "loss": 1.1858, "num_input_tokens_seen": 1396352, "step": 2415 }, { "epoch": 0.36044086982424783, "grad_norm": 1.8492920398712158, "learning_rate": 1.80145963658028e-05, "loss": 1.1251, "num_input_tokens_seen": 1399328, "step": 2420 }, { "epoch": 0.36118558236520704, "grad_norm": 2.7254605293273926, "learning_rate": 1.805183199285076e-05, "loss": 1.0791, "num_input_tokens_seen": 1402080, "step": 2425 }, { "epoch": 0.36193029490616624, "grad_norm": 1.4725046157836914, "learning_rate": 1.808906761989872e-05, "loss": 0.9747, "num_input_tokens_seen": 1404576, "step": 2430 }, { "epoch": 0.3626750074471254, "grad_norm": 1.609421730041504, "learning_rate": 1.812630324694668e-05, "loss": 0.9184, "num_input_tokens_seen": 1407552, "step": 2435 }, { "epoch": 0.3634197199880846, "grad_norm": 1.6476330757141113, "learning_rate": 1.816353887399464e-05, "loss": 0.9129, "num_input_tokens_seen": 1410272, "step": 2440 }, { "epoch": 0.3641644325290438, "grad_norm": 1.7688223123550415, "learning_rate": 1.8200774501042596e-05, "loss": 1.1621, "num_input_tokens_seen": 1412992, "step": 2445 }, { "epoch": 0.364909145070003, "grad_norm": 1.4233447313308716, "learning_rate": 1.8238010128090556e-05, "loss": 0.9711, "num_input_tokens_seen": 1415840, "step": 2450 }, { "epoch": 0.3656538576109622, "grad_norm": 1.5354080200195312, "learning_rate": 1.8275245755138516e-05, "loss": 0.9583, "num_input_tokens_seen": 1418560, "step": 2455 }, { "epoch": 0.36639857015192134, "grad_norm": 1.4111666679382324, "learning_rate": 1.8312481382186476e-05, "loss": 1.0167, "num_input_tokens_seen": 1421664, "step": 2460 }, { "epoch": 0.36714328269288055, "grad_norm": 1.6558030843734741, "learning_rate": 1.8349717009234436e-05, "loss": 1.0281, "num_input_tokens_seen": 1424512, "step": 2465 }, { "epoch": 0.36788799523383975, "grad_norm": 1.217509150505066, "learning_rate": 1.8386952636282396e-05, "loss": 0.9909, "num_input_tokens_seen": 1427392, "step": 2470 }, { "epoch": 0.36863270777479895, "grad_norm": 1.7795865535736084, "learning_rate": 1.8424188263330356e-05, "loss": 0.9807, "num_input_tokens_seen": 1430048, "step": 2475 }, { "epoch": 0.3693774203157581, "grad_norm": 2.5249032974243164, "learning_rate": 1.8461423890378315e-05, "loss": 1.0773, "num_input_tokens_seen": 1432704, "step": 2480 }, { "epoch": 0.3701221328567173, "grad_norm": 1.3847683668136597, "learning_rate": 1.8498659517426275e-05, "loss": 0.9061, "num_input_tokens_seen": 1435488, "step": 2485 }, { "epoch": 0.3708668453976765, "grad_norm": 1.5209673643112183, "learning_rate": 1.8535895144474232e-05, "loss": 0.9633, "num_input_tokens_seen": 1438176, "step": 2490 }, { "epoch": 0.3716115579386357, "grad_norm": 1.6287862062454224, "learning_rate": 1.857313077152219e-05, "loss": 1.0202, "num_input_tokens_seen": 1440800, "step": 2495 }, { "epoch": 0.37235627047959485, "grad_norm": 1.74672532081604, "learning_rate": 1.8610366398570155e-05, "loss": 1.1705, "num_input_tokens_seen": 1443808, "step": 2500 }, { "epoch": 0.37310098302055406, "grad_norm": 1.6729695796966553, "learning_rate": 1.8647602025618115e-05, "loss": 0.9963, "num_input_tokens_seen": 1446752, "step": 2505 }, { "epoch": 0.37384569556151326, "grad_norm": 1.6077556610107422, "learning_rate": 1.868483765266607e-05, "loss": 0.9073, "num_input_tokens_seen": 1449792, "step": 2510 }, { "epoch": 0.37459040810247246, "grad_norm": 1.3524081707000732, "learning_rate": 1.872207327971403e-05, "loss": 1.0188, "num_input_tokens_seen": 1452672, "step": 2515 }, { "epoch": 0.3753351206434316, "grad_norm": 1.5686630010604858, "learning_rate": 1.875930890676199e-05, "loss": 1.0498, "num_input_tokens_seen": 1455456, "step": 2520 }, { "epoch": 0.3760798331843908, "grad_norm": 1.4310308694839478, "learning_rate": 1.879654453380995e-05, "loss": 0.9628, "num_input_tokens_seen": 1458208, "step": 2525 }, { "epoch": 0.37682454572535, "grad_norm": 1.7424920797348022, "learning_rate": 1.8833780160857907e-05, "loss": 1.0026, "num_input_tokens_seen": 1461024, "step": 2530 }, { "epoch": 0.3775692582663092, "grad_norm": 1.292934775352478, "learning_rate": 1.8871015787905867e-05, "loss": 0.8984, "num_input_tokens_seen": 1463776, "step": 2535 }, { "epoch": 0.3783139708072684, "grad_norm": 1.4134544134140015, "learning_rate": 1.890825141495383e-05, "loss": 0.9248, "num_input_tokens_seen": 1466688, "step": 2540 }, { "epoch": 0.37905868334822757, "grad_norm": 1.2777390480041504, "learning_rate": 1.894548704200179e-05, "loss": 1.0114, "num_input_tokens_seen": 1469536, "step": 2545 }, { "epoch": 0.37980339588918677, "grad_norm": 1.5101903676986694, "learning_rate": 1.898272266904975e-05, "loss": 0.9572, "num_input_tokens_seen": 1472096, "step": 2550 }, { "epoch": 0.38054810843014597, "grad_norm": 1.288270354270935, "learning_rate": 1.9019958296097706e-05, "loss": 0.9644, "num_input_tokens_seen": 1475008, "step": 2555 }, { "epoch": 0.3812928209711052, "grad_norm": 1.3224650621414185, "learning_rate": 1.9057193923145666e-05, "loss": 0.9718, "num_input_tokens_seen": 1477664, "step": 2560 }, { "epoch": 0.3820375335120643, "grad_norm": 1.2587172985076904, "learning_rate": 1.9094429550193626e-05, "loss": 0.8934, "num_input_tokens_seen": 1480576, "step": 2565 }, { "epoch": 0.3827822460530235, "grad_norm": 1.7469449043273926, "learning_rate": 1.9131665177241586e-05, "loss": 1.0137, "num_input_tokens_seen": 1483488, "step": 2570 }, { "epoch": 0.3835269585939827, "grad_norm": 1.416097640991211, "learning_rate": 1.9168900804289542e-05, "loss": 0.9551, "num_input_tokens_seen": 1486368, "step": 2575 }, { "epoch": 0.38427167113494193, "grad_norm": 2.138904333114624, "learning_rate": 1.9206136431337506e-05, "loss": 0.894, "num_input_tokens_seen": 1489216, "step": 2580 }, { "epoch": 0.3850163836759011, "grad_norm": 1.6133038997650146, "learning_rate": 1.9243372058385466e-05, "loss": 0.8604, "num_input_tokens_seen": 1492032, "step": 2585 }, { "epoch": 0.3857610962168603, "grad_norm": 1.7386854887008667, "learning_rate": 1.9280607685433425e-05, "loss": 0.9414, "num_input_tokens_seen": 1494784, "step": 2590 }, { "epoch": 0.3865058087578195, "grad_norm": 1.2429965734481812, "learning_rate": 1.9317843312481382e-05, "loss": 0.8197, "num_input_tokens_seen": 1497856, "step": 2595 }, { "epoch": 0.3872505212987787, "grad_norm": 1.8841451406478882, "learning_rate": 1.9355078939529342e-05, "loss": 0.8572, "num_input_tokens_seen": 1500512, "step": 2600 }, { "epoch": 0.3879952338397379, "grad_norm": 1.4199172258377075, "learning_rate": 1.93923145665773e-05, "loss": 0.9649, "num_input_tokens_seen": 1503328, "step": 2605 }, { "epoch": 0.38873994638069703, "grad_norm": 1.413501262664795, "learning_rate": 1.942955019362526e-05, "loss": 1.0101, "num_input_tokens_seen": 1506016, "step": 2610 }, { "epoch": 0.38948465892165623, "grad_norm": 1.8393019437789917, "learning_rate": 1.946678582067322e-05, "loss": 1.0167, "num_input_tokens_seen": 1508736, "step": 2615 }, { "epoch": 0.39022937146261544, "grad_norm": 2.0174221992492676, "learning_rate": 1.950402144772118e-05, "loss": 1.3299, "num_input_tokens_seen": 1512896, "step": 2620 }, { "epoch": 0.39097408400357464, "grad_norm": 2.3330917358398438, "learning_rate": 1.954125707476914e-05, "loss": 1.014, "num_input_tokens_seen": 1515712, "step": 2625 }, { "epoch": 0.3917187965445338, "grad_norm": 1.9176461696624756, "learning_rate": 1.95784927018171e-05, "loss": 0.9319, "num_input_tokens_seen": 1518560, "step": 2630 }, { "epoch": 0.392463509085493, "grad_norm": 1.5908892154693604, "learning_rate": 1.961572832886506e-05, "loss": 0.8707, "num_input_tokens_seen": 1521536, "step": 2635 }, { "epoch": 0.3932082216264522, "grad_norm": 1.6490566730499268, "learning_rate": 1.9652963955913017e-05, "loss": 1.0594, "num_input_tokens_seen": 1524544, "step": 2640 }, { "epoch": 0.3939529341674114, "grad_norm": 1.879272699356079, "learning_rate": 1.9690199582960977e-05, "loss": 0.9004, "num_input_tokens_seen": 1527744, "step": 2645 }, { "epoch": 0.39469764670837054, "grad_norm": 1.6244432926177979, "learning_rate": 1.9727435210008937e-05, "loss": 0.9602, "num_input_tokens_seen": 1530592, "step": 2650 }, { "epoch": 0.39544235924932974, "grad_norm": 1.622620701789856, "learning_rate": 1.9764670837056897e-05, "loss": 0.9726, "num_input_tokens_seen": 1533568, "step": 2655 }, { "epoch": 0.39618707179028895, "grad_norm": 1.4392982721328735, "learning_rate": 1.9801906464104857e-05, "loss": 1.0019, "num_input_tokens_seen": 1536512, "step": 2660 }, { "epoch": 0.39693178433124815, "grad_norm": 1.6166670322418213, "learning_rate": 1.9839142091152816e-05, "loss": 0.924, "num_input_tokens_seen": 1539520, "step": 2665 }, { "epoch": 0.39767649687220735, "grad_norm": 1.6220943927764893, "learning_rate": 1.9876377718200776e-05, "loss": 0.9093, "num_input_tokens_seen": 1542464, "step": 2670 }, { "epoch": 0.3984212094131665, "grad_norm": 1.416454792022705, "learning_rate": 1.9913613345248736e-05, "loss": 0.8314, "num_input_tokens_seen": 1545344, "step": 2675 }, { "epoch": 0.3991659219541257, "grad_norm": 1.9738060235977173, "learning_rate": 1.9950848972296696e-05, "loss": 0.9784, "num_input_tokens_seen": 1548256, "step": 2680 }, { "epoch": 0.3999106344950849, "grad_norm": 1.60405695438385, "learning_rate": 1.9988084599344652e-05, "loss": 1.016, "num_input_tokens_seen": 1551328, "step": 2685 }, { "epoch": 0.4006553470360441, "grad_norm": 1.7098960876464844, "learning_rate": 2.0025320226392612e-05, "loss": 0.9592, "num_input_tokens_seen": 1554048, "step": 2690 }, { "epoch": 0.40140005957700325, "grad_norm": 1.6417254209518433, "learning_rate": 2.0062555853440572e-05, "loss": 0.8843, "num_input_tokens_seen": 1556704, "step": 2695 }, { "epoch": 0.40214477211796246, "grad_norm": 1.4768264293670654, "learning_rate": 2.0099791480488532e-05, "loss": 0.9702, "num_input_tokens_seen": 1559680, "step": 2700 }, { "epoch": 0.40288948465892166, "grad_norm": 1.712859034538269, "learning_rate": 2.0137027107536492e-05, "loss": 0.9815, "num_input_tokens_seen": 1562464, "step": 2705 }, { "epoch": 0.40363419719988086, "grad_norm": 1.513818621635437, "learning_rate": 2.0174262734584452e-05, "loss": 0.9004, "num_input_tokens_seen": 1565376, "step": 2710 }, { "epoch": 0.40437890974084, "grad_norm": 1.8518719673156738, "learning_rate": 2.021149836163241e-05, "loss": 0.9911, "num_input_tokens_seen": 1568256, "step": 2715 }, { "epoch": 0.4051236222817992, "grad_norm": 1.6745103597640991, "learning_rate": 2.024873398868037e-05, "loss": 1.0692, "num_input_tokens_seen": 1571424, "step": 2720 }, { "epoch": 0.4058683348227584, "grad_norm": 1.5973478555679321, "learning_rate": 2.0285969615728328e-05, "loss": 0.9453, "num_input_tokens_seen": 1574112, "step": 2725 }, { "epoch": 0.4066130473637176, "grad_norm": 1.0979008674621582, "learning_rate": 2.0323205242776288e-05, "loss": 0.9413, "num_input_tokens_seen": 1577024, "step": 2730 }, { "epoch": 0.4073577599046768, "grad_norm": 1.299804449081421, "learning_rate": 2.0360440869824248e-05, "loss": 1.0089, "num_input_tokens_seen": 1579808, "step": 2735 }, { "epoch": 0.40810247244563597, "grad_norm": 1.9276363849639893, "learning_rate": 2.0397676496872207e-05, "loss": 1.0266, "num_input_tokens_seen": 1582752, "step": 2740 }, { "epoch": 0.40884718498659517, "grad_norm": 1.5210247039794922, "learning_rate": 2.043491212392017e-05, "loss": 0.9545, "num_input_tokens_seen": 1585728, "step": 2745 }, { "epoch": 0.40959189752755437, "grad_norm": 1.7735463380813599, "learning_rate": 2.0472147750968127e-05, "loss": 0.8963, "num_input_tokens_seen": 1588480, "step": 2750 }, { "epoch": 0.4103366100685136, "grad_norm": 2.172898292541504, "learning_rate": 2.0509383378016087e-05, "loss": 1.0176, "num_input_tokens_seen": 1591680, "step": 2755 }, { "epoch": 0.4110813226094727, "grad_norm": 1.3983640670776367, "learning_rate": 2.0546619005064047e-05, "loss": 0.8977, "num_input_tokens_seen": 1594496, "step": 2760 }, { "epoch": 0.4118260351504319, "grad_norm": 1.6170904636383057, "learning_rate": 2.0583854632112007e-05, "loss": 0.9837, "num_input_tokens_seen": 1597408, "step": 2765 }, { "epoch": 0.4125707476913911, "grad_norm": 1.5321210622787476, "learning_rate": 2.0621090259159963e-05, "loss": 0.8769, "num_input_tokens_seen": 1600288, "step": 2770 }, { "epoch": 0.41331546023235033, "grad_norm": 1.4698824882507324, "learning_rate": 2.0658325886207923e-05, "loss": 0.9918, "num_input_tokens_seen": 1603200, "step": 2775 }, { "epoch": 0.4140601727733095, "grad_norm": 1.2315069437026978, "learning_rate": 2.0695561513255883e-05, "loss": 0.8928, "num_input_tokens_seen": 1606144, "step": 2780 }, { "epoch": 0.4148048853142687, "grad_norm": 1.3463751077651978, "learning_rate": 2.0732797140303846e-05, "loss": 1.0166, "num_input_tokens_seen": 1609152, "step": 2785 }, { "epoch": 0.4155495978552279, "grad_norm": 1.6316088438034058, "learning_rate": 2.0770032767351803e-05, "loss": 0.9388, "num_input_tokens_seen": 1612192, "step": 2790 }, { "epoch": 0.4162943103961871, "grad_norm": 1.6106486320495605, "learning_rate": 2.0807268394399762e-05, "loss": 0.9303, "num_input_tokens_seen": 1614944, "step": 2795 }, { "epoch": 0.4170390229371463, "grad_norm": 1.4126830101013184, "learning_rate": 2.0844504021447722e-05, "loss": 0.9083, "num_input_tokens_seen": 1617856, "step": 2800 }, { "epoch": 0.41778373547810543, "grad_norm": 1.0450958013534546, "learning_rate": 2.0881739648495682e-05, "loss": 0.8857, "num_input_tokens_seen": 1620928, "step": 2805 }, { "epoch": 0.41852844801906464, "grad_norm": 1.5532305240631104, "learning_rate": 2.0918975275543642e-05, "loss": 0.8688, "num_input_tokens_seen": 1623744, "step": 2810 }, { "epoch": 0.41927316056002384, "grad_norm": 1.809333324432373, "learning_rate": 2.09562109025916e-05, "loss": 0.9922, "num_input_tokens_seen": 1626720, "step": 2815 }, { "epoch": 0.42001787310098304, "grad_norm": 2.226473331451416, "learning_rate": 2.099344652963956e-05, "loss": 1.0334, "num_input_tokens_seen": 1629696, "step": 2820 }, { "epoch": 0.4207625856419422, "grad_norm": 1.4110677242279053, "learning_rate": 2.103068215668752e-05, "loss": 0.9302, "num_input_tokens_seen": 1632416, "step": 2825 }, { "epoch": 0.4215072981829014, "grad_norm": 1.375343918800354, "learning_rate": 2.106791778373548e-05, "loss": 0.9387, "num_input_tokens_seen": 1635168, "step": 2830 }, { "epoch": 0.4222520107238606, "grad_norm": 1.4860739707946777, "learning_rate": 2.1105153410783438e-05, "loss": 0.9173, "num_input_tokens_seen": 1637760, "step": 2835 }, { "epoch": 0.4229967232648198, "grad_norm": 1.4669058322906494, "learning_rate": 2.1142389037831398e-05, "loss": 0.8967, "num_input_tokens_seen": 1640480, "step": 2840 }, { "epoch": 0.42374143580577894, "grad_norm": 1.5701823234558105, "learning_rate": 2.1179624664879358e-05, "loss": 0.9208, "num_input_tokens_seen": 1643232, "step": 2845 }, { "epoch": 0.42448614834673815, "grad_norm": 1.2535971403121948, "learning_rate": 2.1216860291927317e-05, "loss": 0.8441, "num_input_tokens_seen": 1646208, "step": 2850 }, { "epoch": 0.42523086088769735, "grad_norm": 1.2462310791015625, "learning_rate": 2.1254095918975274e-05, "loss": 0.8494, "num_input_tokens_seen": 1648864, "step": 2855 }, { "epoch": 0.42597557342865655, "grad_norm": 1.4163107872009277, "learning_rate": 2.1291331546023234e-05, "loss": 0.9329, "num_input_tokens_seen": 1651648, "step": 2860 }, { "epoch": 0.42672028596961575, "grad_norm": 1.6778515577316284, "learning_rate": 2.1328567173071197e-05, "loss": 0.9373, "num_input_tokens_seen": 1654272, "step": 2865 }, { "epoch": 0.4274649985105749, "grad_norm": 1.2672820091247559, "learning_rate": 2.1365802800119157e-05, "loss": 0.9117, "num_input_tokens_seen": 1657280, "step": 2870 }, { "epoch": 0.4282097110515341, "grad_norm": 1.2863231897354126, "learning_rate": 2.1403038427167117e-05, "loss": 0.9343, "num_input_tokens_seen": 1660000, "step": 2875 }, { "epoch": 0.4289544235924933, "grad_norm": 1.6903307437896729, "learning_rate": 2.1440274054215073e-05, "loss": 0.8804, "num_input_tokens_seen": 1662880, "step": 2880 }, { "epoch": 0.4296991361334525, "grad_norm": 1.5863945484161377, "learning_rate": 2.1477509681263033e-05, "loss": 0.9509, "num_input_tokens_seen": 1665760, "step": 2885 }, { "epoch": 0.43044384867441166, "grad_norm": 1.3959016799926758, "learning_rate": 2.1514745308310993e-05, "loss": 0.8759, "num_input_tokens_seen": 1668640, "step": 2890 }, { "epoch": 0.43118856121537086, "grad_norm": 1.2646760940551758, "learning_rate": 2.1551980935358953e-05, "loss": 0.9371, "num_input_tokens_seen": 1671552, "step": 2895 }, { "epoch": 0.43193327375633006, "grad_norm": 1.898356318473816, "learning_rate": 2.1589216562406913e-05, "loss": 1.0091, "num_input_tokens_seen": 1674432, "step": 2900 }, { "epoch": 0.43267798629728926, "grad_norm": 1.0471428632736206, "learning_rate": 2.1626452189454872e-05, "loss": 0.9214, "num_input_tokens_seen": 1677248, "step": 2905 }, { "epoch": 0.4334226988382484, "grad_norm": 1.6304844617843628, "learning_rate": 2.1663687816502832e-05, "loss": 0.8755, "num_input_tokens_seen": 1680064, "step": 2910 }, { "epoch": 0.4341674113792076, "grad_norm": 1.7434375286102295, "learning_rate": 2.1700923443550792e-05, "loss": 0.9215, "num_input_tokens_seen": 1682848, "step": 2915 }, { "epoch": 0.4349121239201668, "grad_norm": 2.3962161540985107, "learning_rate": 2.173815907059875e-05, "loss": 1.0842, "num_input_tokens_seen": 1685792, "step": 2920 }, { "epoch": 0.435656836461126, "grad_norm": 1.5946602821350098, "learning_rate": 2.177539469764671e-05, "loss": 0.8687, "num_input_tokens_seen": 1688448, "step": 2925 }, { "epoch": 0.4364015490020852, "grad_norm": 1.1290044784545898, "learning_rate": 2.1812630324694668e-05, "loss": 0.8498, "num_input_tokens_seen": 1691328, "step": 2930 }, { "epoch": 0.43714626154304437, "grad_norm": 1.5493556261062622, "learning_rate": 2.1849865951742628e-05, "loss": 0.8516, "num_input_tokens_seen": 1694336, "step": 2935 }, { "epoch": 0.43789097408400357, "grad_norm": 1.3050668239593506, "learning_rate": 2.1887101578790588e-05, "loss": 0.8492, "num_input_tokens_seen": 1697056, "step": 2940 }, { "epoch": 0.4386356866249628, "grad_norm": 1.3157371282577515, "learning_rate": 2.1924337205838548e-05, "loss": 0.876, "num_input_tokens_seen": 1699808, "step": 2945 }, { "epoch": 0.439380399165922, "grad_norm": 1.677422046661377, "learning_rate": 2.1961572832886508e-05, "loss": 0.8901, "num_input_tokens_seen": 1702848, "step": 2950 }, { "epoch": 0.4401251117068811, "grad_norm": 1.1218832731246948, "learning_rate": 2.1998808459934468e-05, "loss": 0.8885, "num_input_tokens_seen": 1705696, "step": 2955 }, { "epoch": 0.4408698242478403, "grad_norm": 1.7356301546096802, "learning_rate": 2.2036044086982427e-05, "loss": 0.9105, "num_input_tokens_seen": 1708576, "step": 2960 }, { "epoch": 0.4416145367887995, "grad_norm": 1.446213722229004, "learning_rate": 2.2073279714030384e-05, "loss": 0.9046, "num_input_tokens_seen": 1711328, "step": 2965 }, { "epoch": 0.44235924932975873, "grad_norm": 1.5545227527618408, "learning_rate": 2.2110515341078344e-05, "loss": 0.864, "num_input_tokens_seen": 1714368, "step": 2970 }, { "epoch": 0.4431039618707179, "grad_norm": 1.6478866338729858, "learning_rate": 2.2147750968126304e-05, "loss": 0.9364, "num_input_tokens_seen": 1717248, "step": 2975 }, { "epoch": 0.4438486744116771, "grad_norm": 1.6408898830413818, "learning_rate": 2.2184986595174263e-05, "loss": 0.8648, "num_input_tokens_seen": 1720384, "step": 2980 }, { "epoch": 0.4445933869526363, "grad_norm": 1.9865039587020874, "learning_rate": 2.2222222222222223e-05, "loss": 0.9884, "num_input_tokens_seen": 1723360, "step": 2985 }, { "epoch": 0.4453380994935955, "grad_norm": 1.5051794052124023, "learning_rate": 2.2259457849270183e-05, "loss": 0.8692, "num_input_tokens_seen": 1725920, "step": 2990 }, { "epoch": 0.4460828120345547, "grad_norm": 1.3004719018936157, "learning_rate": 2.2296693476318143e-05, "loss": 0.8266, "num_input_tokens_seen": 1728672, "step": 2995 }, { "epoch": 0.44682752457551383, "grad_norm": 1.2216365337371826, "learning_rate": 2.2333929103366103e-05, "loss": 0.874, "num_input_tokens_seen": 1731488, "step": 3000 }, { "epoch": 0.44757223711647304, "grad_norm": 1.587033748626709, "learning_rate": 2.2371164730414063e-05, "loss": 0.951, "num_input_tokens_seen": 1734496, "step": 3005 }, { "epoch": 0.44831694965743224, "grad_norm": 1.0187381505966187, "learning_rate": 2.240840035746202e-05, "loss": 0.9624, "num_input_tokens_seen": 1737408, "step": 3010 }, { "epoch": 0.44906166219839144, "grad_norm": 1.9185658693313599, "learning_rate": 2.244563598450998e-05, "loss": 0.8447, "num_input_tokens_seen": 1740096, "step": 3015 }, { "epoch": 0.4498063747393506, "grad_norm": 1.7120966911315918, "learning_rate": 2.248287161155794e-05, "loss": 0.8806, "num_input_tokens_seen": 1742688, "step": 3020 }, { "epoch": 0.4505510872803098, "grad_norm": 1.6759586334228516, "learning_rate": 2.25201072386059e-05, "loss": 1.0184, "num_input_tokens_seen": 1745760, "step": 3025 }, { "epoch": 0.451295799821269, "grad_norm": 1.5065349340438843, "learning_rate": 2.255734286565386e-05, "loss": 0.8739, "num_input_tokens_seen": 1748768, "step": 3030 }, { "epoch": 0.4520405123622282, "grad_norm": 1.5667169094085693, "learning_rate": 2.259457849270182e-05, "loss": 0.8264, "num_input_tokens_seen": 1751680, "step": 3035 }, { "epoch": 0.45278522490318734, "grad_norm": 1.2129313945770264, "learning_rate": 2.2631814119749778e-05, "loss": 0.9597, "num_input_tokens_seen": 1754496, "step": 3040 }, { "epoch": 0.45352993744414655, "grad_norm": 1.5411748886108398, "learning_rate": 2.2669049746797738e-05, "loss": 0.9229, "num_input_tokens_seen": 1757024, "step": 3045 }, { "epoch": 0.45427464998510575, "grad_norm": 1.2084896564483643, "learning_rate": 2.2706285373845695e-05, "loss": 0.8163, "num_input_tokens_seen": 1759936, "step": 3050 }, { "epoch": 0.45501936252606495, "grad_norm": 1.4446862936019897, "learning_rate": 2.2743521000893654e-05, "loss": 0.9347, "num_input_tokens_seen": 1762880, "step": 3055 }, { "epoch": 0.45576407506702415, "grad_norm": 1.502665638923645, "learning_rate": 2.2780756627941614e-05, "loss": 0.9079, "num_input_tokens_seen": 1765952, "step": 3060 }, { "epoch": 0.4565087876079833, "grad_norm": 1.2552956342697144, "learning_rate": 2.2817992254989574e-05, "loss": 0.8589, "num_input_tokens_seen": 1768704, "step": 3065 }, { "epoch": 0.4572535001489425, "grad_norm": 1.7765542268753052, "learning_rate": 2.2855227882037537e-05, "loss": 0.9496, "num_input_tokens_seen": 1771488, "step": 3070 }, { "epoch": 0.4579982126899017, "grad_norm": 1.365600347518921, "learning_rate": 2.2892463509085494e-05, "loss": 0.9487, "num_input_tokens_seen": 1774624, "step": 3075 }, { "epoch": 0.4587429252308609, "grad_norm": 1.4357402324676514, "learning_rate": 2.2929699136133454e-05, "loss": 0.9285, "num_input_tokens_seen": 1777728, "step": 3080 }, { "epoch": 0.45948763777182006, "grad_norm": 1.5256000757217407, "learning_rate": 2.2966934763181414e-05, "loss": 0.9062, "num_input_tokens_seen": 1780640, "step": 3085 }, { "epoch": 0.46023235031277926, "grad_norm": 1.182991862297058, "learning_rate": 2.3004170390229373e-05, "loss": 0.9113, "num_input_tokens_seen": 1783520, "step": 3090 }, { "epoch": 0.46097706285373846, "grad_norm": 1.2074847221374512, "learning_rate": 2.304140601727733e-05, "loss": 0.8491, "num_input_tokens_seen": 1786240, "step": 3095 }, { "epoch": 0.46172177539469766, "grad_norm": 1.4651601314544678, "learning_rate": 2.307864164432529e-05, "loss": 0.8968, "num_input_tokens_seen": 1789184, "step": 3100 }, { "epoch": 0.4624664879356568, "grad_norm": 1.194392204284668, "learning_rate": 2.3115877271373253e-05, "loss": 0.8764, "num_input_tokens_seen": 1791968, "step": 3105 }, { "epoch": 0.463211200476616, "grad_norm": 1.236756443977356, "learning_rate": 2.3153112898421213e-05, "loss": 0.9489, "num_input_tokens_seen": 1794816, "step": 3110 }, { "epoch": 0.4639559130175752, "grad_norm": 1.003218650817871, "learning_rate": 2.319034852546917e-05, "loss": 0.8631, "num_input_tokens_seen": 1797696, "step": 3115 }, { "epoch": 0.4647006255585344, "grad_norm": 1.364611268043518, "learning_rate": 2.322758415251713e-05, "loss": 0.908, "num_input_tokens_seen": 1800640, "step": 3120 }, { "epoch": 0.4654453380994936, "grad_norm": 1.4481889009475708, "learning_rate": 2.326481977956509e-05, "loss": 0.9253, "num_input_tokens_seen": 1803552, "step": 3125 }, { "epoch": 0.46619005064045277, "grad_norm": 1.0615166425704956, "learning_rate": 2.330205540661305e-05, "loss": 0.9104, "num_input_tokens_seen": 1806624, "step": 3130 }, { "epoch": 0.46693476318141197, "grad_norm": 2.4252569675445557, "learning_rate": 2.333929103366101e-05, "loss": 1.0078, "num_input_tokens_seen": 1809760, "step": 3135 }, { "epoch": 0.4676794757223712, "grad_norm": 1.3586591482162476, "learning_rate": 2.3376526660708965e-05, "loss": 0.916, "num_input_tokens_seen": 1812544, "step": 3140 }, { "epoch": 0.4684241882633304, "grad_norm": 1.0356488227844238, "learning_rate": 2.341376228775693e-05, "loss": 0.9594, "num_input_tokens_seen": 1815072, "step": 3145 }, { "epoch": 0.4691689008042895, "grad_norm": 1.1668678522109985, "learning_rate": 2.3450997914804888e-05, "loss": 0.8558, "num_input_tokens_seen": 1817760, "step": 3150 }, { "epoch": 0.4699136133452487, "grad_norm": 1.1961095333099365, "learning_rate": 2.3488233541852848e-05, "loss": 0.9122, "num_input_tokens_seen": 1820512, "step": 3155 }, { "epoch": 0.47065832588620793, "grad_norm": 1.1639193296432495, "learning_rate": 2.3525469168900805e-05, "loss": 0.8662, "num_input_tokens_seen": 1823808, "step": 3160 }, { "epoch": 0.47140303842716713, "grad_norm": 1.2363728284835815, "learning_rate": 2.3562704795948764e-05, "loss": 0.8245, "num_input_tokens_seen": 1826656, "step": 3165 }, { "epoch": 0.4721477509681263, "grad_norm": 2.1387686729431152, "learning_rate": 2.3599940422996724e-05, "loss": 1.1369, "num_input_tokens_seen": 1830304, "step": 3170 }, { "epoch": 0.4728924635090855, "grad_norm": 1.4269158840179443, "learning_rate": 2.3637176050044684e-05, "loss": 0.8969, "num_input_tokens_seen": 1833408, "step": 3175 }, { "epoch": 0.4736371760500447, "grad_norm": 1.4891337156295776, "learning_rate": 2.367441167709264e-05, "loss": 0.9046, "num_input_tokens_seen": 1836096, "step": 3180 }, { "epoch": 0.4743818885910039, "grad_norm": 1.3384512662887573, "learning_rate": 2.3711647304140604e-05, "loss": 0.8971, "num_input_tokens_seen": 1838912, "step": 3185 }, { "epoch": 0.4751266011319631, "grad_norm": 1.3052929639816284, "learning_rate": 2.3748882931188564e-05, "loss": 0.8147, "num_input_tokens_seen": 1841824, "step": 3190 }, { "epoch": 0.47587131367292224, "grad_norm": 1.0370960235595703, "learning_rate": 2.3786118558236524e-05, "loss": 0.8397, "num_input_tokens_seen": 1844800, "step": 3195 }, { "epoch": 0.47661602621388144, "grad_norm": 1.2659093141555786, "learning_rate": 2.3823354185284483e-05, "loss": 0.8796, "num_input_tokens_seen": 1847648, "step": 3200 }, { "epoch": 0.47736073875484064, "grad_norm": 1.4622634649276733, "learning_rate": 2.386058981233244e-05, "loss": 0.8229, "num_input_tokens_seen": 1850464, "step": 3205 }, { "epoch": 0.47810545129579984, "grad_norm": 1.7683565616607666, "learning_rate": 2.38978254393804e-05, "loss": 0.8775, "num_input_tokens_seen": 1853248, "step": 3210 }, { "epoch": 0.478850163836759, "grad_norm": 1.2906861305236816, "learning_rate": 2.393506106642836e-05, "loss": 0.9264, "num_input_tokens_seen": 1856000, "step": 3215 }, { "epoch": 0.4795948763777182, "grad_norm": 1.196792721748352, "learning_rate": 2.397229669347632e-05, "loss": 0.9691, "num_input_tokens_seen": 1858944, "step": 3220 }, { "epoch": 0.4803395889186774, "grad_norm": 1.2802107334136963, "learning_rate": 2.400953232052428e-05, "loss": 0.8846, "num_input_tokens_seen": 1862080, "step": 3225 }, { "epoch": 0.4810843014596366, "grad_norm": 1.6665648221969604, "learning_rate": 2.404676794757224e-05, "loss": 0.8378, "num_input_tokens_seen": 1865088, "step": 3230 }, { "epoch": 0.48182901400059575, "grad_norm": 1.0811558961868286, "learning_rate": 2.40840035746202e-05, "loss": 0.836, "num_input_tokens_seen": 1867552, "step": 3235 }, { "epoch": 0.48257372654155495, "grad_norm": 1.6322972774505615, "learning_rate": 2.412123920166816e-05, "loss": 0.9926, "num_input_tokens_seen": 1870784, "step": 3240 }, { "epoch": 0.48331843908251415, "grad_norm": 1.643203616142273, "learning_rate": 2.4158474828716115e-05, "loss": 0.9432, "num_input_tokens_seen": 1873920, "step": 3245 }, { "epoch": 0.48406315162347335, "grad_norm": 1.2899094820022583, "learning_rate": 2.4195710455764075e-05, "loss": 0.9447, "num_input_tokens_seen": 1877056, "step": 3250 }, { "epoch": 0.48480786416443256, "grad_norm": 1.4412879943847656, "learning_rate": 2.4232946082812035e-05, "loss": 0.8515, "num_input_tokens_seen": 1880160, "step": 3255 }, { "epoch": 0.4855525767053917, "grad_norm": 1.4714083671569824, "learning_rate": 2.4270181709859995e-05, "loss": 0.9054, "num_input_tokens_seen": 1882912, "step": 3260 }, { "epoch": 0.4862972892463509, "grad_norm": 1.548269271850586, "learning_rate": 2.4307417336907955e-05, "loss": 0.8694, "num_input_tokens_seen": 1885536, "step": 3265 }, { "epoch": 0.4870420017873101, "grad_norm": 1.5023950338363647, "learning_rate": 2.4344652963955915e-05, "loss": 0.8605, "num_input_tokens_seen": 1888416, "step": 3270 }, { "epoch": 0.4877867143282693, "grad_norm": 1.2326092720031738, "learning_rate": 2.4381888591003874e-05, "loss": 0.8608, "num_input_tokens_seen": 1891456, "step": 3275 }, { "epoch": 0.48853142686922846, "grad_norm": 1.1700459718704224, "learning_rate": 2.4419124218051834e-05, "loss": 0.8799, "num_input_tokens_seen": 1894240, "step": 3280 }, { "epoch": 0.48927613941018766, "grad_norm": 1.0232409238815308, "learning_rate": 2.4456359845099794e-05, "loss": 0.9275, "num_input_tokens_seen": 1897088, "step": 3285 }, { "epoch": 0.49002085195114686, "grad_norm": 1.0748567581176758, "learning_rate": 2.449359547214775e-05, "loss": 0.923, "num_input_tokens_seen": 1900192, "step": 3290 }, { "epoch": 0.49076556449210607, "grad_norm": 1.429492712020874, "learning_rate": 2.453083109919571e-05, "loss": 0.9361, "num_input_tokens_seen": 1903136, "step": 3295 }, { "epoch": 0.4915102770330652, "grad_norm": 1.348734736442566, "learning_rate": 2.456806672624367e-05, "loss": 0.8231, "num_input_tokens_seen": 1906016, "step": 3300 }, { "epoch": 0.4922549895740244, "grad_norm": 1.1905447244644165, "learning_rate": 2.460530235329163e-05, "loss": 0.8913, "num_input_tokens_seen": 1909184, "step": 3305 }, { "epoch": 0.4929997021149836, "grad_norm": 1.6242786645889282, "learning_rate": 2.464253798033959e-05, "loss": 0.881, "num_input_tokens_seen": 1911872, "step": 3310 }, { "epoch": 0.4937444146559428, "grad_norm": 1.0799806118011475, "learning_rate": 2.467977360738755e-05, "loss": 0.8157, "num_input_tokens_seen": 1914848, "step": 3315 }, { "epoch": 0.494489127196902, "grad_norm": 1.3518892526626587, "learning_rate": 2.471700923443551e-05, "loss": 0.9193, "num_input_tokens_seen": 1917920, "step": 3320 }, { "epoch": 0.49523383973786117, "grad_norm": 1.2260205745697021, "learning_rate": 2.475424486148347e-05, "loss": 0.9167, "num_input_tokens_seen": 1920704, "step": 3325 }, { "epoch": 0.4959785522788204, "grad_norm": 1.5572794675827026, "learning_rate": 2.4791480488531426e-05, "loss": 0.9475, "num_input_tokens_seen": 1923264, "step": 3330 }, { "epoch": 0.4967232648197796, "grad_norm": 2.041170120239258, "learning_rate": 2.4828716115579386e-05, "loss": 0.9509, "num_input_tokens_seen": 1926048, "step": 3335 }, { "epoch": 0.4974679773607388, "grad_norm": 1.6067720651626587, "learning_rate": 2.4865951742627346e-05, "loss": 0.8176, "num_input_tokens_seen": 1928896, "step": 3340 }, { "epoch": 0.4982126899016979, "grad_norm": 1.8538618087768555, "learning_rate": 2.4903187369675306e-05, "loss": 0.7899, "num_input_tokens_seen": 1931776, "step": 3345 }, { "epoch": 0.4989574024426571, "grad_norm": 1.136476993560791, "learning_rate": 2.494042299672327e-05, "loss": 0.8098, "num_input_tokens_seen": 1934624, "step": 3350 }, { "epoch": 0.49970211498361633, "grad_norm": 1.1941850185394287, "learning_rate": 2.4977658623771225e-05, "loss": 0.9385, "num_input_tokens_seen": 1937600, "step": 3355 }, { "epoch": 0.5, "eval_loss": 0.8721396923065186, "eval_runtime": 45.4565, "eval_samples_per_second": 65.645, "eval_steps_per_second": 16.411, "num_input_tokens_seen": 1938656, "step": 3357 }, { "epoch": 0.5004468275245755, "grad_norm": 1.3570953607559204, "learning_rate": 2.501489425081919e-05, "loss": 0.9287, "num_input_tokens_seen": 1940192, "step": 3360 }, { "epoch": 0.5011915400655347, "grad_norm": 1.1584827899932861, "learning_rate": 2.505212987786714e-05, "loss": 0.8744, "num_input_tokens_seen": 1942912, "step": 3365 }, { "epoch": 0.5019362526064939, "grad_norm": 1.4261876344680786, "learning_rate": 2.50893655049151e-05, "loss": 0.8678, "num_input_tokens_seen": 1945504, "step": 3370 }, { "epoch": 0.5026809651474531, "grad_norm": 1.7362483739852905, "learning_rate": 2.512660113196306e-05, "loss": 0.9114, "num_input_tokens_seen": 1948512, "step": 3375 }, { "epoch": 0.5034256776884123, "grad_norm": 1.1181539297103882, "learning_rate": 2.516383675901102e-05, "loss": 0.8378, "num_input_tokens_seen": 1951392, "step": 3380 }, { "epoch": 0.5041703902293715, "grad_norm": 1.280983328819275, "learning_rate": 2.520107238605898e-05, "loss": 0.7938, "num_input_tokens_seen": 1954272, "step": 3385 }, { "epoch": 0.5049151027703307, "grad_norm": 1.1653778553009033, "learning_rate": 2.5238308013106944e-05, "loss": 0.8266, "num_input_tokens_seen": 1957024, "step": 3390 }, { "epoch": 0.5056598153112899, "grad_norm": 1.351906657218933, "learning_rate": 2.5275543640154904e-05, "loss": 0.8699, "num_input_tokens_seen": 1959744, "step": 3395 }, { "epoch": 0.506404527852249, "grad_norm": 1.5046261548995972, "learning_rate": 2.5312779267202864e-05, "loss": 0.8901, "num_input_tokens_seen": 1962688, "step": 3400 }, { "epoch": 0.5071492403932082, "grad_norm": 1.2695038318634033, "learning_rate": 2.5350014894250824e-05, "loss": 0.9128, "num_input_tokens_seen": 1965440, "step": 3405 }, { "epoch": 0.5078939529341674, "grad_norm": 1.3487132787704468, "learning_rate": 2.5387250521298777e-05, "loss": 0.8343, "num_input_tokens_seen": 1968192, "step": 3410 }, { "epoch": 0.5086386654751266, "grad_norm": 1.357403039932251, "learning_rate": 2.5424486148346737e-05, "loss": 0.8633, "num_input_tokens_seen": 1970912, "step": 3415 }, { "epoch": 0.5093833780160858, "grad_norm": 1.0065279006958008, "learning_rate": 2.5461721775394697e-05, "loss": 0.861, "num_input_tokens_seen": 1973984, "step": 3420 }, { "epoch": 0.510128090557045, "grad_norm": 1.3133811950683594, "learning_rate": 2.5498957402442656e-05, "loss": 0.8398, "num_input_tokens_seen": 1976800, "step": 3425 }, { "epoch": 0.5108728030980042, "grad_norm": 1.5655205249786377, "learning_rate": 2.553619302949062e-05, "loss": 0.8739, "num_input_tokens_seen": 1979776, "step": 3430 }, { "epoch": 0.5116175156389634, "grad_norm": 1.7712197303771973, "learning_rate": 2.557342865653858e-05, "loss": 0.9292, "num_input_tokens_seen": 1982656, "step": 3435 }, { "epoch": 0.5123622281799225, "grad_norm": 1.4471874237060547, "learning_rate": 2.561066428358654e-05, "loss": 0.9074, "num_input_tokens_seen": 1985312, "step": 3440 }, { "epoch": 0.5131069407208817, "grad_norm": 1.5501629114151, "learning_rate": 2.56478999106345e-05, "loss": 0.8607, "num_input_tokens_seen": 1988384, "step": 3445 }, { "epoch": 0.5138516532618409, "grad_norm": 1.6877120733261108, "learning_rate": 2.5685135537682452e-05, "loss": 0.9011, "num_input_tokens_seen": 1991296, "step": 3450 }, { "epoch": 0.5145963658028001, "grad_norm": 0.9789985418319702, "learning_rate": 2.5722371164730412e-05, "loss": 0.8516, "num_input_tokens_seen": 1993952, "step": 3455 }, { "epoch": 0.5153410783437593, "grad_norm": 1.2764087915420532, "learning_rate": 2.5759606791778372e-05, "loss": 0.8798, "num_input_tokens_seen": 1997024, "step": 3460 }, { "epoch": 0.5160857908847185, "grad_norm": 1.063651204109192, "learning_rate": 2.5796842418826332e-05, "loss": 0.8919, "num_input_tokens_seen": 1999776, "step": 3465 }, { "epoch": 0.5168305034256777, "grad_norm": 1.1137369871139526, "learning_rate": 2.5834078045874295e-05, "loss": 0.9064, "num_input_tokens_seen": 2002752, "step": 3470 }, { "epoch": 0.5175752159666369, "grad_norm": 1.3718105554580688, "learning_rate": 2.5871313672922255e-05, "loss": 0.8586, "num_input_tokens_seen": 2005408, "step": 3475 }, { "epoch": 0.5183199285075961, "grad_norm": 0.9533346891403198, "learning_rate": 2.5908549299970215e-05, "loss": 0.8544, "num_input_tokens_seen": 2008320, "step": 3480 }, { "epoch": 0.5190646410485552, "grad_norm": 0.9622763395309448, "learning_rate": 2.5945784927018175e-05, "loss": 0.8342, "num_input_tokens_seen": 2011040, "step": 3485 }, { "epoch": 0.5198093535895144, "grad_norm": 1.351835012435913, "learning_rate": 2.5983020554066135e-05, "loss": 0.8862, "num_input_tokens_seen": 2013888, "step": 3490 }, { "epoch": 0.5205540661304736, "grad_norm": 1.5509014129638672, "learning_rate": 2.6020256181114088e-05, "loss": 0.8594, "num_input_tokens_seen": 2017024, "step": 3495 }, { "epoch": 0.5212987786714328, "grad_norm": 1.5787512063980103, "learning_rate": 2.6057491808162047e-05, "loss": 0.8508, "num_input_tokens_seen": 2019968, "step": 3500 }, { "epoch": 0.522043491212392, "grad_norm": 1.2607324123382568, "learning_rate": 2.609472743521001e-05, "loss": 0.8513, "num_input_tokens_seen": 2023104, "step": 3505 }, { "epoch": 0.5227882037533512, "grad_norm": 1.3870116472244263, "learning_rate": 2.613196306225797e-05, "loss": 0.8792, "num_input_tokens_seen": 2026016, "step": 3510 }, { "epoch": 0.5235329162943104, "grad_norm": 1.184601902961731, "learning_rate": 2.616919868930593e-05, "loss": 0.8351, "num_input_tokens_seen": 2028960, "step": 3515 }, { "epoch": 0.5242776288352696, "grad_norm": 1.5926939249038696, "learning_rate": 2.620643431635389e-05, "loss": 0.8783, "num_input_tokens_seen": 2032096, "step": 3520 }, { "epoch": 0.5250223413762288, "grad_norm": 1.2804481983184814, "learning_rate": 2.624366994340185e-05, "loss": 0.8246, "num_input_tokens_seen": 2034656, "step": 3525 }, { "epoch": 0.5257670539171879, "grad_norm": 1.4360971450805664, "learning_rate": 2.628090557044981e-05, "loss": 0.9087, "num_input_tokens_seen": 2037760, "step": 3530 }, { "epoch": 0.5265117664581471, "grad_norm": 1.577371597290039, "learning_rate": 2.631814119749777e-05, "loss": 0.9506, "num_input_tokens_seen": 2041248, "step": 3535 }, { "epoch": 0.5272564789991063, "grad_norm": 1.4475153684616089, "learning_rate": 2.6355376824545723e-05, "loss": 1.0042, "num_input_tokens_seen": 2044384, "step": 3540 }, { "epoch": 0.5280011915400655, "grad_norm": 1.139878749847412, "learning_rate": 2.6392612451593686e-05, "loss": 0.8644, "num_input_tokens_seen": 2046944, "step": 3545 }, { "epoch": 0.5287459040810247, "grad_norm": 1.5945777893066406, "learning_rate": 2.6429848078641646e-05, "loss": 0.8368, "num_input_tokens_seen": 2049792, "step": 3550 }, { "epoch": 0.5294906166219839, "grad_norm": 1.2384653091430664, "learning_rate": 2.6467083705689606e-05, "loss": 0.9062, "num_input_tokens_seen": 2052736, "step": 3555 }, { "epoch": 0.5302353291629431, "grad_norm": 1.6875205039978027, "learning_rate": 2.6504319332737566e-05, "loss": 0.8018, "num_input_tokens_seen": 2055616, "step": 3560 }, { "epoch": 0.5309800417039023, "grad_norm": 1.0436980724334717, "learning_rate": 2.6541554959785526e-05, "loss": 0.8407, "num_input_tokens_seen": 2058464, "step": 3565 }, { "epoch": 0.5317247542448614, "grad_norm": 1.3157029151916504, "learning_rate": 2.6578790586833485e-05, "loss": 0.8284, "num_input_tokens_seen": 2061152, "step": 3570 }, { "epoch": 0.5324694667858206, "grad_norm": 1.0634435415267944, "learning_rate": 2.6616026213881445e-05, "loss": 0.8121, "num_input_tokens_seen": 2063936, "step": 3575 }, { "epoch": 0.5332141793267798, "grad_norm": 1.275468111038208, "learning_rate": 2.66532618409294e-05, "loss": 0.8904, "num_input_tokens_seen": 2066688, "step": 3580 }, { "epoch": 0.533958891867739, "grad_norm": 1.3250608444213867, "learning_rate": 2.669049746797736e-05, "loss": 0.8483, "num_input_tokens_seen": 2069664, "step": 3585 }, { "epoch": 0.5347036044086982, "grad_norm": 1.3755810260772705, "learning_rate": 2.672773309502532e-05, "loss": 0.9702, "num_input_tokens_seen": 2072672, "step": 3590 }, { "epoch": 0.5354483169496574, "grad_norm": 1.1338242292404175, "learning_rate": 2.676496872207328e-05, "loss": 0.8873, "num_input_tokens_seen": 2075776, "step": 3595 }, { "epoch": 0.5361930294906166, "grad_norm": 1.2439855337142944, "learning_rate": 2.680220434912124e-05, "loss": 0.9446, "num_input_tokens_seen": 2078656, "step": 3600 }, { "epoch": 0.5369377420315758, "grad_norm": 2.2262182235717773, "learning_rate": 2.68394399761692e-05, "loss": 0.9972, "num_input_tokens_seen": 2081824, "step": 3605 }, { "epoch": 0.537682454572535, "grad_norm": 1.2610195875167847, "learning_rate": 2.687667560321716e-05, "loss": 0.8365, "num_input_tokens_seen": 2084672, "step": 3610 }, { "epoch": 0.5384271671134941, "grad_norm": 1.4022457599639893, "learning_rate": 2.691391123026512e-05, "loss": 0.8564, "num_input_tokens_seen": 2087296, "step": 3615 }, { "epoch": 0.5391718796544533, "grad_norm": 1.1158947944641113, "learning_rate": 2.695114685731308e-05, "loss": 0.8861, "num_input_tokens_seen": 2090272, "step": 3620 }, { "epoch": 0.5399165921954125, "grad_norm": 1.4824625253677368, "learning_rate": 2.6988382484361037e-05, "loss": 0.8764, "num_input_tokens_seen": 2093152, "step": 3625 }, { "epoch": 0.5406613047363718, "grad_norm": 1.4164752960205078, "learning_rate": 2.7025618111408997e-05, "loss": 0.9555, "num_input_tokens_seen": 2096160, "step": 3630 }, { "epoch": 0.541406017277331, "grad_norm": 1.532111406326294, "learning_rate": 2.7062853738456957e-05, "loss": 0.8608, "num_input_tokens_seen": 2098784, "step": 3635 }, { "epoch": 0.5421507298182902, "grad_norm": 1.4702656269073486, "learning_rate": 2.7100089365504917e-05, "loss": 0.8554, "num_input_tokens_seen": 2101440, "step": 3640 }, { "epoch": 0.5428954423592494, "grad_norm": 0.824699342250824, "learning_rate": 2.7137324992552876e-05, "loss": 0.8792, "num_input_tokens_seen": 2104448, "step": 3645 }, { "epoch": 0.5436401549002086, "grad_norm": 1.4313766956329346, "learning_rate": 2.7174560619600836e-05, "loss": 0.8658, "num_input_tokens_seen": 2107232, "step": 3650 }, { "epoch": 0.5443848674411678, "grad_norm": 1.1058796644210815, "learning_rate": 2.7211796246648796e-05, "loss": 0.8012, "num_input_tokens_seen": 2110336, "step": 3655 }, { "epoch": 0.5451295799821269, "grad_norm": 1.1222940683364868, "learning_rate": 2.7249031873696756e-05, "loss": 0.8844, "num_input_tokens_seen": 2113472, "step": 3660 }, { "epoch": 0.5458742925230861, "grad_norm": 2.4783928394317627, "learning_rate": 2.7286267500744716e-05, "loss": 1.0203, "num_input_tokens_seen": 2117216, "step": 3665 }, { "epoch": 0.5466190050640453, "grad_norm": 1.5889490842819214, "learning_rate": 2.7323503127792672e-05, "loss": 0.9244, "num_input_tokens_seen": 2120128, "step": 3670 }, { "epoch": 0.5473637176050045, "grad_norm": 1.5217089653015137, "learning_rate": 2.7360738754840632e-05, "loss": 0.9515, "num_input_tokens_seen": 2123328, "step": 3675 }, { "epoch": 0.5481084301459637, "grad_norm": 1.2653964757919312, "learning_rate": 2.7397974381888592e-05, "loss": 0.865, "num_input_tokens_seen": 2126528, "step": 3680 }, { "epoch": 0.5488531426869229, "grad_norm": 1.8818576335906982, "learning_rate": 2.7435210008936552e-05, "loss": 0.8936, "num_input_tokens_seen": 2129248, "step": 3685 }, { "epoch": 0.5495978552278821, "grad_norm": 1.2950255870819092, "learning_rate": 2.7472445635984512e-05, "loss": 0.8133, "num_input_tokens_seen": 2132192, "step": 3690 }, { "epoch": 0.5503425677688413, "grad_norm": 1.5853486061096191, "learning_rate": 2.750968126303247e-05, "loss": 0.8442, "num_input_tokens_seen": 2135104, "step": 3695 }, { "epoch": 0.5510872803098004, "grad_norm": 1.2517777681350708, "learning_rate": 2.754691689008043e-05, "loss": 0.8595, "num_input_tokens_seen": 2137824, "step": 3700 }, { "epoch": 0.5518319928507596, "grad_norm": 1.576731562614441, "learning_rate": 2.758415251712839e-05, "loss": 0.8837, "num_input_tokens_seen": 2140608, "step": 3705 }, { "epoch": 0.5525767053917188, "grad_norm": 1.2807211875915527, "learning_rate": 2.7621388144176348e-05, "loss": 0.8718, "num_input_tokens_seen": 2143392, "step": 3710 }, { "epoch": 0.553321417932678, "grad_norm": 1.2129799127578735, "learning_rate": 2.7658623771224308e-05, "loss": 0.8461, "num_input_tokens_seen": 2146432, "step": 3715 }, { "epoch": 0.5540661304736372, "grad_norm": 1.3164674043655396, "learning_rate": 2.7695859398272267e-05, "loss": 0.8929, "num_input_tokens_seen": 2149248, "step": 3720 }, { "epoch": 0.5548108430145964, "grad_norm": 1.302836298942566, "learning_rate": 2.7733095025320227e-05, "loss": 0.868, "num_input_tokens_seen": 2152352, "step": 3725 }, { "epoch": 0.5555555555555556, "grad_norm": 1.2546007633209229, "learning_rate": 2.7770330652368187e-05, "loss": 0.8848, "num_input_tokens_seen": 2155264, "step": 3730 }, { "epoch": 0.5563002680965148, "grad_norm": 1.2611249685287476, "learning_rate": 2.7807566279416147e-05, "loss": 0.8038, "num_input_tokens_seen": 2158336, "step": 3735 }, { "epoch": 0.557044980637474, "grad_norm": 1.1643311977386475, "learning_rate": 2.7844801906464107e-05, "loss": 0.815, "num_input_tokens_seen": 2161344, "step": 3740 }, { "epoch": 0.5577896931784331, "grad_norm": 1.3035588264465332, "learning_rate": 2.7882037533512067e-05, "loss": 0.8049, "num_input_tokens_seen": 2164032, "step": 3745 }, { "epoch": 0.5585344057193923, "grad_norm": 1.1948238611221313, "learning_rate": 2.7919273160560027e-05, "loss": 0.8987, "num_input_tokens_seen": 2166912, "step": 3750 }, { "epoch": 0.5592791182603515, "grad_norm": 1.0997251272201538, "learning_rate": 2.7956508787607983e-05, "loss": 0.8456, "num_input_tokens_seen": 2170016, "step": 3755 }, { "epoch": 0.5600238308013107, "grad_norm": 2.3392953872680664, "learning_rate": 2.7993744414655943e-05, "loss": 0.8326, "num_input_tokens_seen": 2172448, "step": 3760 }, { "epoch": 0.5607685433422699, "grad_norm": 1.6679168939590454, "learning_rate": 2.8030980041703903e-05, "loss": 0.8727, "num_input_tokens_seen": 2175776, "step": 3765 }, { "epoch": 0.5615132558832291, "grad_norm": 1.7812299728393555, "learning_rate": 2.8068215668751863e-05, "loss": 0.8506, "num_input_tokens_seen": 2178560, "step": 3770 }, { "epoch": 0.5622579684241883, "grad_norm": 1.3077250719070435, "learning_rate": 2.8105451295799822e-05, "loss": 0.867, "num_input_tokens_seen": 2181600, "step": 3775 }, { "epoch": 0.5630026809651475, "grad_norm": 1.4392935037612915, "learning_rate": 2.8142686922847782e-05, "loss": 0.7992, "num_input_tokens_seen": 2184448, "step": 3780 }, { "epoch": 0.5637473935061067, "grad_norm": 1.5183354616165161, "learning_rate": 2.8179922549895742e-05, "loss": 0.7756, "num_input_tokens_seen": 2187424, "step": 3785 }, { "epoch": 0.5644921060470658, "grad_norm": 0.9020177125930786, "learning_rate": 2.8217158176943702e-05, "loss": 0.8119, "num_input_tokens_seen": 2190080, "step": 3790 }, { "epoch": 0.565236818588025, "grad_norm": 1.3407392501831055, "learning_rate": 2.825439380399166e-05, "loss": 0.8549, "num_input_tokens_seen": 2192800, "step": 3795 }, { "epoch": 0.5659815311289842, "grad_norm": 1.155027985572815, "learning_rate": 2.8291629431039618e-05, "loss": 0.8344, "num_input_tokens_seen": 2195840, "step": 3800 }, { "epoch": 0.5667262436699434, "grad_norm": 1.300225019454956, "learning_rate": 2.8328865058087578e-05, "loss": 0.8703, "num_input_tokens_seen": 2199424, "step": 3805 }, { "epoch": 0.5674709562109026, "grad_norm": 1.6297986507415771, "learning_rate": 2.8366100685135538e-05, "loss": 0.9886, "num_input_tokens_seen": 2202208, "step": 3810 }, { "epoch": 0.5682156687518618, "grad_norm": 1.2879425287246704, "learning_rate": 2.8403336312183498e-05, "loss": 0.8146, "num_input_tokens_seen": 2205152, "step": 3815 }, { "epoch": 0.568960381292821, "grad_norm": 1.1714378595352173, "learning_rate": 2.8440571939231458e-05, "loss": 0.8943, "num_input_tokens_seen": 2208224, "step": 3820 }, { "epoch": 0.5697050938337802, "grad_norm": 1.139182448387146, "learning_rate": 2.8477807566279418e-05, "loss": 0.8644, "num_input_tokens_seen": 2211296, "step": 3825 }, { "epoch": 0.5704498063747393, "grad_norm": 1.3875011205673218, "learning_rate": 2.8515043193327377e-05, "loss": 0.8599, "num_input_tokens_seen": 2214176, "step": 3830 }, { "epoch": 0.5711945189156985, "grad_norm": 1.1674236059188843, "learning_rate": 2.8552278820375337e-05, "loss": 0.7978, "num_input_tokens_seen": 2216512, "step": 3835 }, { "epoch": 0.5719392314566577, "grad_norm": 1.2418150901794434, "learning_rate": 2.8589514447423294e-05, "loss": 0.8286, "num_input_tokens_seen": 2219392, "step": 3840 }, { "epoch": 0.5726839439976169, "grad_norm": 1.4826003313064575, "learning_rate": 2.8626750074471254e-05, "loss": 0.9257, "num_input_tokens_seen": 2222368, "step": 3845 }, { "epoch": 0.5734286565385761, "grad_norm": 1.1866264343261719, "learning_rate": 2.8663985701519213e-05, "loss": 0.8868, "num_input_tokens_seen": 2225472, "step": 3850 }, { "epoch": 0.5741733690795353, "grad_norm": 0.9983932375907898, "learning_rate": 2.8701221328567173e-05, "loss": 0.8602, "num_input_tokens_seen": 2228416, "step": 3855 }, { "epoch": 0.5749180816204945, "grad_norm": 1.4338866472244263, "learning_rate": 2.8738456955615133e-05, "loss": 0.8779, "num_input_tokens_seen": 2231104, "step": 3860 }, { "epoch": 0.5756627941614537, "grad_norm": 1.5278030633926392, "learning_rate": 2.8775692582663093e-05, "loss": 0.8545, "num_input_tokens_seen": 2233760, "step": 3865 }, { "epoch": 0.5764075067024129, "grad_norm": 1.098684549331665, "learning_rate": 2.8812928209711053e-05, "loss": 0.8165, "num_input_tokens_seen": 2236608, "step": 3870 }, { "epoch": 0.577152219243372, "grad_norm": 1.1295806169509888, "learning_rate": 2.8850163836759013e-05, "loss": 0.926, "num_input_tokens_seen": 2239424, "step": 3875 }, { "epoch": 0.5778969317843312, "grad_norm": 1.3902184963226318, "learning_rate": 2.8887399463806976e-05, "loss": 0.8662, "num_input_tokens_seen": 2242272, "step": 3880 }, { "epoch": 0.5786416443252904, "grad_norm": 0.9877424836158752, "learning_rate": 2.892463509085493e-05, "loss": 0.8679, "num_input_tokens_seen": 2245216, "step": 3885 }, { "epoch": 0.5793863568662496, "grad_norm": 0.9977008104324341, "learning_rate": 2.896187071790289e-05, "loss": 0.8764, "num_input_tokens_seen": 2248352, "step": 3890 }, { "epoch": 0.5801310694072088, "grad_norm": 0.8989164233207703, "learning_rate": 2.899910634495085e-05, "loss": 0.8374, "num_input_tokens_seen": 2251008, "step": 3895 }, { "epoch": 0.580875781948168, "grad_norm": 0.9736471772193909, "learning_rate": 2.903634197199881e-05, "loss": 0.8993, "num_input_tokens_seen": 2254048, "step": 3900 }, { "epoch": 0.5816204944891272, "grad_norm": 1.3832380771636963, "learning_rate": 2.907357759904677e-05, "loss": 0.8188, "num_input_tokens_seen": 2257088, "step": 3905 }, { "epoch": 0.5823652070300864, "grad_norm": 1.2008092403411865, "learning_rate": 2.9110813226094728e-05, "loss": 0.8217, "num_input_tokens_seen": 2260000, "step": 3910 }, { "epoch": 0.5831099195710456, "grad_norm": 1.2006351947784424, "learning_rate": 2.914804885314269e-05, "loss": 0.7876, "num_input_tokens_seen": 2262688, "step": 3915 }, { "epoch": 0.5838546321120047, "grad_norm": 0.7947505712509155, "learning_rate": 2.918528448019065e-05, "loss": 0.7612, "num_input_tokens_seen": 2265536, "step": 3920 }, { "epoch": 0.5845993446529639, "grad_norm": 1.3170045614242554, "learning_rate": 2.9222520107238604e-05, "loss": 0.9797, "num_input_tokens_seen": 2268512, "step": 3925 }, { "epoch": 0.5853440571939231, "grad_norm": 1.373175859451294, "learning_rate": 2.9259755734286564e-05, "loss": 0.8764, "num_input_tokens_seen": 2271712, "step": 3930 }, { "epoch": 0.5860887697348823, "grad_norm": 1.3405522108078003, "learning_rate": 2.9296991361334524e-05, "loss": 0.8496, "num_input_tokens_seen": 2274560, "step": 3935 }, { "epoch": 0.5868334822758415, "grad_norm": 1.2924672365188599, "learning_rate": 2.9334226988382484e-05, "loss": 0.8582, "num_input_tokens_seen": 2277376, "step": 3940 }, { "epoch": 0.5875781948168007, "grad_norm": 1.293419361114502, "learning_rate": 2.9371462615430444e-05, "loss": 0.8283, "num_input_tokens_seen": 2279840, "step": 3945 }, { "epoch": 0.5883229073577599, "grad_norm": 1.4196871519088745, "learning_rate": 2.9408698242478404e-05, "loss": 0.8734, "num_input_tokens_seen": 2282560, "step": 3950 }, { "epoch": 0.5890676198987191, "grad_norm": 1.5594643354415894, "learning_rate": 2.9445933869526367e-05, "loss": 0.8393, "num_input_tokens_seen": 2285536, "step": 3955 }, { "epoch": 0.5898123324396782, "grad_norm": 1.28119957447052, "learning_rate": 2.9483169496574327e-05, "loss": 0.8811, "num_input_tokens_seen": 2288288, "step": 3960 }, { "epoch": 0.5905570449806374, "grad_norm": 1.0750420093536377, "learning_rate": 2.9520405123622287e-05, "loss": 0.7286, "num_input_tokens_seen": 2291040, "step": 3965 }, { "epoch": 0.5913017575215966, "grad_norm": 1.0504299402236938, "learning_rate": 2.955764075067024e-05, "loss": 0.8446, "num_input_tokens_seen": 2293792, "step": 3970 }, { "epoch": 0.5920464700625558, "grad_norm": 1.3539741039276123, "learning_rate": 2.95948763777182e-05, "loss": 0.835, "num_input_tokens_seen": 2296800, "step": 3975 }, { "epoch": 0.592791182603515, "grad_norm": 1.7065528631210327, "learning_rate": 2.963211200476616e-05, "loss": 0.8071, "num_input_tokens_seen": 2299744, "step": 3980 }, { "epoch": 0.5935358951444742, "grad_norm": 1.2730776071548462, "learning_rate": 2.966934763181412e-05, "loss": 0.837, "num_input_tokens_seen": 2302784, "step": 3985 }, { "epoch": 0.5942806076854334, "grad_norm": 1.6041476726531982, "learning_rate": 2.970658325886208e-05, "loss": 0.9572, "num_input_tokens_seen": 2305632, "step": 3990 }, { "epoch": 0.5950253202263927, "grad_norm": 1.1580020189285278, "learning_rate": 2.9743818885910042e-05, "loss": 0.9318, "num_input_tokens_seen": 2308576, "step": 3995 }, { "epoch": 0.5957700327673519, "grad_norm": 1.5878199338912964, "learning_rate": 2.9781054512958002e-05, "loss": 0.7842, "num_input_tokens_seen": 2311552, "step": 4000 }, { "epoch": 0.596514745308311, "grad_norm": 1.0414018630981445, "learning_rate": 2.9818290140005962e-05, "loss": 0.8778, "num_input_tokens_seen": 2314400, "step": 4005 }, { "epoch": 0.5972594578492701, "grad_norm": 1.306864619255066, "learning_rate": 2.9855525767053922e-05, "loss": 0.8885, "num_input_tokens_seen": 2317184, "step": 4010 }, { "epoch": 0.5980041703902294, "grad_norm": 1.7423995733261108, "learning_rate": 2.9892761394101875e-05, "loss": 0.8158, "num_input_tokens_seen": 2320160, "step": 4015 }, { "epoch": 0.5987488829311886, "grad_norm": 1.2277711629867554, "learning_rate": 2.9929997021149835e-05, "loss": 0.832, "num_input_tokens_seen": 2322976, "step": 4020 }, { "epoch": 0.5994935954721478, "grad_norm": 1.43282151222229, "learning_rate": 2.9967232648197795e-05, "loss": 0.8861, "num_input_tokens_seen": 2325888, "step": 4025 }, { "epoch": 0.600238308013107, "grad_norm": 1.0155311822891235, "learning_rate": 3.0004468275245755e-05, "loss": 0.808, "num_input_tokens_seen": 2328672, "step": 4030 }, { "epoch": 0.6009830205540662, "grad_norm": 1.1654084920883179, "learning_rate": 3.0041703902293718e-05, "loss": 0.8235, "num_input_tokens_seen": 2331680, "step": 4035 }, { "epoch": 0.6017277330950254, "grad_norm": 1.1642718315124512, "learning_rate": 3.0078939529341678e-05, "loss": 0.8369, "num_input_tokens_seen": 2334272, "step": 4040 }, { "epoch": 0.6024724456359845, "grad_norm": 1.134674072265625, "learning_rate": 3.0116175156389638e-05, "loss": 0.7925, "num_input_tokens_seen": 2336928, "step": 4045 }, { "epoch": 0.6032171581769437, "grad_norm": 1.3158087730407715, "learning_rate": 3.0153410783437597e-05, "loss": 0.8827, "num_input_tokens_seen": 2340000, "step": 4050 }, { "epoch": 0.6039618707179029, "grad_norm": 0.8818653225898743, "learning_rate": 3.019064641048555e-05, "loss": 0.797, "num_input_tokens_seen": 2342784, "step": 4055 }, { "epoch": 0.6047065832588621, "grad_norm": 1.1831094026565552, "learning_rate": 3.022788203753351e-05, "loss": 0.8769, "num_input_tokens_seen": 2345504, "step": 4060 }, { "epoch": 0.6054512957998213, "grad_norm": 1.0604370832443237, "learning_rate": 3.026511766458147e-05, "loss": 0.86, "num_input_tokens_seen": 2348448, "step": 4065 }, { "epoch": 0.6061960083407805, "grad_norm": 1.232438087463379, "learning_rate": 3.030235329162943e-05, "loss": 0.7734, "num_input_tokens_seen": 2351264, "step": 4070 }, { "epoch": 0.6069407208817397, "grad_norm": 1.0556446313858032, "learning_rate": 3.0339588918677393e-05, "loss": 0.7883, "num_input_tokens_seen": 2353920, "step": 4075 }, { "epoch": 0.6076854334226989, "grad_norm": 1.4961296319961548, "learning_rate": 3.0376824545725353e-05, "loss": 0.7506, "num_input_tokens_seen": 2356608, "step": 4080 }, { "epoch": 0.6084301459636581, "grad_norm": 1.0464918613433838, "learning_rate": 3.0414060172773313e-05, "loss": 0.849, "num_input_tokens_seen": 2359392, "step": 4085 }, { "epoch": 0.6091748585046172, "grad_norm": 1.0878196954727173, "learning_rate": 3.0451295799821273e-05, "loss": 0.7997, "num_input_tokens_seen": 2362528, "step": 4090 }, { "epoch": 0.6099195710455764, "grad_norm": 1.141158103942871, "learning_rate": 3.0488531426869233e-05, "loss": 0.8438, "num_input_tokens_seen": 2365280, "step": 4095 }, { "epoch": 0.6106642835865356, "grad_norm": 1.0891181230545044, "learning_rate": 3.0525767053917186e-05, "loss": 0.8046, "num_input_tokens_seen": 2368096, "step": 4100 }, { "epoch": 0.6114089961274948, "grad_norm": 1.1371361017227173, "learning_rate": 3.056300268096515e-05, "loss": 0.838, "num_input_tokens_seen": 2370976, "step": 4105 }, { "epoch": 0.612153708668454, "grad_norm": 1.3579866886138916, "learning_rate": 3.0600238308013105e-05, "loss": 1.034, "num_input_tokens_seen": 2373888, "step": 4110 }, { "epoch": 0.6128984212094132, "grad_norm": 1.0786268711090088, "learning_rate": 3.063747393506107e-05, "loss": 0.9544, "num_input_tokens_seen": 2376704, "step": 4115 }, { "epoch": 0.6136431337503724, "grad_norm": 1.3782963752746582, "learning_rate": 3.0674709562109025e-05, "loss": 0.8739, "num_input_tokens_seen": 2379456, "step": 4120 }, { "epoch": 0.6143878462913316, "grad_norm": 1.5652260780334473, "learning_rate": 3.071194518915699e-05, "loss": 0.8193, "num_input_tokens_seen": 2382624, "step": 4125 }, { "epoch": 0.6151325588322908, "grad_norm": 1.4892367124557495, "learning_rate": 3.0749180816204945e-05, "loss": 0.8714, "num_input_tokens_seen": 2385472, "step": 4130 }, { "epoch": 0.6158772713732499, "grad_norm": 1.008299469947815, "learning_rate": 3.078641644325291e-05, "loss": 0.8859, "num_input_tokens_seen": 2388384, "step": 4135 }, { "epoch": 0.6166219839142091, "grad_norm": 1.1645631790161133, "learning_rate": 3.082365207030087e-05, "loss": 0.8909, "num_input_tokens_seen": 2391136, "step": 4140 }, { "epoch": 0.6173666964551683, "grad_norm": 1.3072032928466797, "learning_rate": 3.086088769734882e-05, "loss": 0.859, "num_input_tokens_seen": 2394144, "step": 4145 }, { "epoch": 0.6181114089961275, "grad_norm": 1.2572906017303467, "learning_rate": 3.0898123324396784e-05, "loss": 0.8683, "num_input_tokens_seen": 2396992, "step": 4150 }, { "epoch": 0.6188561215370867, "grad_norm": 1.2300736904144287, "learning_rate": 3.093535895144474e-05, "loss": 0.806, "num_input_tokens_seen": 2399616, "step": 4155 }, { "epoch": 0.6196008340780459, "grad_norm": 0.990428626537323, "learning_rate": 3.0972594578492704e-05, "loss": 0.8165, "num_input_tokens_seen": 2402272, "step": 4160 }, { "epoch": 0.6203455466190051, "grad_norm": 1.4776973724365234, "learning_rate": 3.100983020554066e-05, "loss": 0.8474, "num_input_tokens_seen": 2405120, "step": 4165 }, { "epoch": 0.6210902591599643, "grad_norm": 1.2011600732803345, "learning_rate": 3.1047065832588624e-05, "loss": 0.8853, "num_input_tokens_seen": 2408096, "step": 4170 }, { "epoch": 0.6218349717009234, "grad_norm": 1.4324615001678467, "learning_rate": 3.108430145963658e-05, "loss": 0.8629, "num_input_tokens_seen": 2411072, "step": 4175 }, { "epoch": 0.6225796842418826, "grad_norm": 0.9448227882385254, "learning_rate": 3.1121537086684543e-05, "loss": 0.8358, "num_input_tokens_seen": 2414080, "step": 4180 }, { "epoch": 0.6233243967828418, "grad_norm": 1.2656422853469849, "learning_rate": 3.11587727137325e-05, "loss": 0.7834, "num_input_tokens_seen": 2417280, "step": 4185 }, { "epoch": 0.624069109323801, "grad_norm": 0.9729986786842346, "learning_rate": 3.1196008340780456e-05, "loss": 0.7541, "num_input_tokens_seen": 2420128, "step": 4190 }, { "epoch": 0.6248138218647602, "grad_norm": 1.2937440872192383, "learning_rate": 3.123324396782842e-05, "loss": 0.8872, "num_input_tokens_seen": 2423136, "step": 4195 }, { "epoch": 0.6255585344057194, "grad_norm": 1.2910130023956299, "learning_rate": 3.1270479594876376e-05, "loss": 0.8617, "num_input_tokens_seen": 2426144, "step": 4200 }, { "epoch": 0.6263032469466786, "grad_norm": 1.010647177696228, "learning_rate": 3.130771522192434e-05, "loss": 0.8014, "num_input_tokens_seen": 2429280, "step": 4205 }, { "epoch": 0.6270479594876378, "grad_norm": 1.1485437154769897, "learning_rate": 3.1344950848972296e-05, "loss": 0.8266, "num_input_tokens_seen": 2432000, "step": 4210 }, { "epoch": 0.627792672028597, "grad_norm": 1.237249493598938, "learning_rate": 3.138218647602026e-05, "loss": 0.8387, "num_input_tokens_seen": 2435136, "step": 4215 }, { "epoch": 0.6285373845695561, "grad_norm": 0.9732128381729126, "learning_rate": 3.141942210306822e-05, "loss": 0.8471, "num_input_tokens_seen": 2437952, "step": 4220 }, { "epoch": 0.6292820971105153, "grad_norm": 1.5321831703186035, "learning_rate": 3.145665773011618e-05, "loss": 0.89, "num_input_tokens_seen": 2440928, "step": 4225 }, { "epoch": 0.6300268096514745, "grad_norm": 1.2295341491699219, "learning_rate": 3.1493893357164135e-05, "loss": 0.8199, "num_input_tokens_seen": 2443808, "step": 4230 }, { "epoch": 0.6307715221924337, "grad_norm": 1.2130138874053955, "learning_rate": 3.153112898421209e-05, "loss": 0.7991, "num_input_tokens_seen": 2446400, "step": 4235 }, { "epoch": 0.6315162347333929, "grad_norm": 0.8301532864570618, "learning_rate": 3.1568364611260055e-05, "loss": 0.824, "num_input_tokens_seen": 2449376, "step": 4240 }, { "epoch": 0.6322609472743521, "grad_norm": 1.1108092069625854, "learning_rate": 3.160560023830801e-05, "loss": 0.8325, "num_input_tokens_seen": 2452256, "step": 4245 }, { "epoch": 0.6330056598153113, "grad_norm": 1.1527138948440552, "learning_rate": 3.1642835865355975e-05, "loss": 0.8943, "num_input_tokens_seen": 2455040, "step": 4250 }, { "epoch": 0.6337503723562705, "grad_norm": 1.111138939857483, "learning_rate": 3.168007149240393e-05, "loss": 0.8878, "num_input_tokens_seen": 2457952, "step": 4255 }, { "epoch": 0.6344950848972297, "grad_norm": 1.0071576833724976, "learning_rate": 3.1717307119451894e-05, "loss": 0.8731, "num_input_tokens_seen": 2461216, "step": 4260 }, { "epoch": 0.6352397974381888, "grad_norm": 1.1536203622817993, "learning_rate": 3.175454274649986e-05, "loss": 0.8666, "num_input_tokens_seen": 2464544, "step": 4265 }, { "epoch": 0.635984509979148, "grad_norm": 1.1681625843048096, "learning_rate": 3.1791778373547814e-05, "loss": 0.783, "num_input_tokens_seen": 2467392, "step": 4270 }, { "epoch": 0.6367292225201072, "grad_norm": 1.5531505346298218, "learning_rate": 3.182901400059577e-05, "loss": 0.8823, "num_input_tokens_seen": 2470112, "step": 4275 }, { "epoch": 0.6374739350610664, "grad_norm": 1.439759373664856, "learning_rate": 3.186624962764373e-05, "loss": 0.8022, "num_input_tokens_seen": 2472992, "step": 4280 }, { "epoch": 0.6382186476020256, "grad_norm": 1.4405643939971924, "learning_rate": 3.190348525469169e-05, "loss": 0.8888, "num_input_tokens_seen": 2475808, "step": 4285 }, { "epoch": 0.6389633601429848, "grad_norm": 1.4085025787353516, "learning_rate": 3.1940720881739647e-05, "loss": 0.8263, "num_input_tokens_seen": 2478688, "step": 4290 }, { "epoch": 0.639708072683944, "grad_norm": 1.137587308883667, "learning_rate": 3.197795650878761e-05, "loss": 0.8176, "num_input_tokens_seen": 2481344, "step": 4295 }, { "epoch": 0.6404527852249032, "grad_norm": 1.0666371583938599, "learning_rate": 3.201519213583557e-05, "loss": 0.7942, "num_input_tokens_seen": 2484064, "step": 4300 }, { "epoch": 0.6411974977658623, "grad_norm": 1.2214452028274536, "learning_rate": 3.205242776288353e-05, "loss": 0.8369, "num_input_tokens_seen": 2486784, "step": 4305 }, { "epoch": 0.6419422103068215, "grad_norm": 1.604526162147522, "learning_rate": 3.208966338993149e-05, "loss": 0.8286, "num_input_tokens_seen": 2489568, "step": 4310 }, { "epoch": 0.6426869228477807, "grad_norm": 1.3072311878204346, "learning_rate": 3.212689901697944e-05, "loss": 0.8386, "num_input_tokens_seen": 2492512, "step": 4315 }, { "epoch": 0.6434316353887399, "grad_norm": 1.266474962234497, "learning_rate": 3.2164134644027406e-05, "loss": 0.8428, "num_input_tokens_seen": 2495296, "step": 4320 }, { "epoch": 0.6441763479296991, "grad_norm": 1.0957741737365723, "learning_rate": 3.220137027107536e-05, "loss": 0.807, "num_input_tokens_seen": 2498208, "step": 4325 }, { "epoch": 0.6449210604706583, "grad_norm": 1.0534197092056274, "learning_rate": 3.2238605898123325e-05, "loss": 0.7807, "num_input_tokens_seen": 2500800, "step": 4330 }, { "epoch": 0.6456657730116175, "grad_norm": 1.0315253734588623, "learning_rate": 3.227584152517129e-05, "loss": 0.8563, "num_input_tokens_seen": 2503488, "step": 4335 }, { "epoch": 0.6464104855525767, "grad_norm": 1.0263296365737915, "learning_rate": 3.2313077152219245e-05, "loss": 0.8274, "num_input_tokens_seen": 2506272, "step": 4340 }, { "epoch": 0.6471551980935359, "grad_norm": 1.1720918416976929, "learning_rate": 3.235031277926721e-05, "loss": 0.8645, "num_input_tokens_seen": 2508992, "step": 4345 }, { "epoch": 0.647899910634495, "grad_norm": 1.1980715990066528, "learning_rate": 3.2387548406315165e-05, "loss": 0.8551, "num_input_tokens_seen": 2511488, "step": 4350 }, { "epoch": 0.6486446231754542, "grad_norm": 1.022804617881775, "learning_rate": 3.242478403336313e-05, "loss": 0.8306, "num_input_tokens_seen": 2514432, "step": 4355 }, { "epoch": 0.6493893357164134, "grad_norm": 1.3221698999404907, "learning_rate": 3.246201966041108e-05, "loss": 0.9292, "num_input_tokens_seen": 2517664, "step": 4360 }, { "epoch": 0.6501340482573726, "grad_norm": 1.2065120935440063, "learning_rate": 3.249925528745904e-05, "loss": 0.8577, "num_input_tokens_seen": 2520608, "step": 4365 }, { "epoch": 0.6508787607983318, "grad_norm": 1.7113722562789917, "learning_rate": 3.2536490914507e-05, "loss": 0.9815, "num_input_tokens_seen": 2523520, "step": 4370 }, { "epoch": 0.651623473339291, "grad_norm": 1.3473142385482788, "learning_rate": 3.257372654155496e-05, "loss": 0.8331, "num_input_tokens_seen": 2526592, "step": 4375 }, { "epoch": 0.6523681858802503, "grad_norm": 1.4884377717971802, "learning_rate": 3.2610962168602924e-05, "loss": 0.8512, "num_input_tokens_seen": 2529728, "step": 4380 }, { "epoch": 0.6531128984212095, "grad_norm": 1.1478090286254883, "learning_rate": 3.264819779565088e-05, "loss": 0.8114, "num_input_tokens_seen": 2532544, "step": 4385 }, { "epoch": 0.6538576109621687, "grad_norm": 0.8285272717475891, "learning_rate": 3.2685433422698844e-05, "loss": 0.8503, "num_input_tokens_seen": 2535328, "step": 4390 }, { "epoch": 0.6546023235031277, "grad_norm": 1.2943884134292603, "learning_rate": 3.27226690497468e-05, "loss": 0.8094, "num_input_tokens_seen": 2538240, "step": 4395 }, { "epoch": 0.655347036044087, "grad_norm": 1.274190902709961, "learning_rate": 3.275990467679476e-05, "loss": 0.8065, "num_input_tokens_seen": 2540960, "step": 4400 }, { "epoch": 0.6560917485850462, "grad_norm": 1.2734653949737549, "learning_rate": 3.279714030384271e-05, "loss": 0.8821, "num_input_tokens_seen": 2543776, "step": 4405 }, { "epoch": 0.6568364611260054, "grad_norm": 1.000451683998108, "learning_rate": 3.2834375930890676e-05, "loss": 0.8701, "num_input_tokens_seen": 2546624, "step": 4410 }, { "epoch": 0.6575811736669646, "grad_norm": 0.8162661194801331, "learning_rate": 3.287161155793864e-05, "loss": 0.7844, "num_input_tokens_seen": 2549408, "step": 4415 }, { "epoch": 0.6583258862079238, "grad_norm": 1.7672449350357056, "learning_rate": 3.2908847184986596e-05, "loss": 0.8542, "num_input_tokens_seen": 2552384, "step": 4420 }, { "epoch": 0.659070598748883, "grad_norm": 1.0223807096481323, "learning_rate": 3.294608281203456e-05, "loss": 0.8739, "num_input_tokens_seen": 2555200, "step": 4425 }, { "epoch": 0.6598153112898422, "grad_norm": 1.1338704824447632, "learning_rate": 3.2983318439082516e-05, "loss": 0.8142, "num_input_tokens_seen": 2558176, "step": 4430 }, { "epoch": 0.6605600238308013, "grad_norm": 1.341470718383789, "learning_rate": 3.302055406613048e-05, "loss": 0.8612, "num_input_tokens_seen": 2561248, "step": 4435 }, { "epoch": 0.6613047363717605, "grad_norm": 1.1810137033462524, "learning_rate": 3.3057789693178435e-05, "loss": 0.938, "num_input_tokens_seen": 2564384, "step": 4440 }, { "epoch": 0.6620494489127197, "grad_norm": 1.8690370321273804, "learning_rate": 3.309502532022639e-05, "loss": 0.885, "num_input_tokens_seen": 2567008, "step": 4445 }, { "epoch": 0.6627941614536789, "grad_norm": 0.7773482203483582, "learning_rate": 3.313226094727435e-05, "loss": 0.7917, "num_input_tokens_seen": 2569952, "step": 4450 }, { "epoch": 0.6635388739946381, "grad_norm": 0.997076690196991, "learning_rate": 3.316949657432231e-05, "loss": 0.8773, "num_input_tokens_seen": 2572832, "step": 4455 }, { "epoch": 0.6642835865355973, "grad_norm": 1.1312904357910156, "learning_rate": 3.3206732201370275e-05, "loss": 0.855, "num_input_tokens_seen": 2575584, "step": 4460 }, { "epoch": 0.6650282990765565, "grad_norm": 1.2019689083099365, "learning_rate": 3.324396782841823e-05, "loss": 0.9319, "num_input_tokens_seen": 2578656, "step": 4465 }, { "epoch": 0.6657730116175157, "grad_norm": 0.8720852732658386, "learning_rate": 3.3281203455466195e-05, "loss": 0.8497, "num_input_tokens_seen": 2581472, "step": 4470 }, { "epoch": 0.6665177241584749, "grad_norm": 1.1475082635879517, "learning_rate": 3.331843908251415e-05, "loss": 0.8537, "num_input_tokens_seen": 2584384, "step": 4475 }, { "epoch": 0.667262436699434, "grad_norm": 1.159995198249817, "learning_rate": 3.3355674709562114e-05, "loss": 0.7995, "num_input_tokens_seen": 2587136, "step": 4480 }, { "epoch": 0.6680071492403932, "grad_norm": 0.8705872297286987, "learning_rate": 3.339291033661007e-05, "loss": 0.8776, "num_input_tokens_seen": 2590176, "step": 4485 }, { "epoch": 0.6687518617813524, "grad_norm": 1.058168649673462, "learning_rate": 3.343014596365803e-05, "loss": 0.8685, "num_input_tokens_seen": 2592896, "step": 4490 }, { "epoch": 0.6694965743223116, "grad_norm": 1.1424826383590698, "learning_rate": 3.346738159070599e-05, "loss": 0.8278, "num_input_tokens_seen": 2595616, "step": 4495 }, { "epoch": 0.6702412868632708, "grad_norm": 1.1940572261810303, "learning_rate": 3.350461721775395e-05, "loss": 0.8473, "num_input_tokens_seen": 2598528, "step": 4500 }, { "epoch": 0.67098599940423, "grad_norm": 1.0980908870697021, "learning_rate": 3.354185284480191e-05, "loss": 0.84, "num_input_tokens_seen": 2601440, "step": 4505 }, { "epoch": 0.6717307119451892, "grad_norm": 1.1905385255813599, "learning_rate": 3.3579088471849867e-05, "loss": 0.9109, "num_input_tokens_seen": 2604480, "step": 4510 }, { "epoch": 0.6724754244861484, "grad_norm": 0.8794968724250793, "learning_rate": 3.361632409889783e-05, "loss": 0.8762, "num_input_tokens_seen": 2607456, "step": 4515 }, { "epoch": 0.6732201370271076, "grad_norm": 0.9762709736824036, "learning_rate": 3.3653559725945786e-05, "loss": 0.8217, "num_input_tokens_seen": 2610304, "step": 4520 }, { "epoch": 0.6739648495680667, "grad_norm": 1.0313130617141724, "learning_rate": 3.369079535299375e-05, "loss": 0.877, "num_input_tokens_seen": 2612960, "step": 4525 }, { "epoch": 0.6747095621090259, "grad_norm": 0.7211444973945618, "learning_rate": 3.3728030980041706e-05, "loss": 0.8071, "num_input_tokens_seen": 2615424, "step": 4530 }, { "epoch": 0.6754542746499851, "grad_norm": 1.094813346862793, "learning_rate": 3.376526660708966e-05, "loss": 0.8734, "num_input_tokens_seen": 2618496, "step": 4535 }, { "epoch": 0.6761989871909443, "grad_norm": 1.3401778936386108, "learning_rate": 3.3802502234137626e-05, "loss": 0.8134, "num_input_tokens_seen": 2621280, "step": 4540 }, { "epoch": 0.6769436997319035, "grad_norm": 1.0345200300216675, "learning_rate": 3.383973786118558e-05, "loss": 0.793, "num_input_tokens_seen": 2624000, "step": 4545 }, { "epoch": 0.6776884122728627, "grad_norm": 0.8572093844413757, "learning_rate": 3.3876973488233545e-05, "loss": 0.829, "num_input_tokens_seen": 2626976, "step": 4550 }, { "epoch": 0.6784331248138219, "grad_norm": 1.086379885673523, "learning_rate": 3.39142091152815e-05, "loss": 0.8552, "num_input_tokens_seen": 2629824, "step": 4555 }, { "epoch": 0.6791778373547811, "grad_norm": 0.8036803603172302, "learning_rate": 3.3951444742329465e-05, "loss": 0.7879, "num_input_tokens_seen": 2632608, "step": 4560 }, { "epoch": 0.6799225498957402, "grad_norm": 0.9777621626853943, "learning_rate": 3.398868036937742e-05, "loss": 0.808, "num_input_tokens_seen": 2635616, "step": 4565 }, { "epoch": 0.6806672624366994, "grad_norm": 0.8652511239051819, "learning_rate": 3.4025915996425385e-05, "loss": 0.8263, "num_input_tokens_seen": 2638848, "step": 4570 }, { "epoch": 0.6814119749776586, "grad_norm": 1.045183539390564, "learning_rate": 3.406315162347334e-05, "loss": 0.8483, "num_input_tokens_seen": 2641664, "step": 4575 }, { "epoch": 0.6821566875186178, "grad_norm": 1.4335217475891113, "learning_rate": 3.41003872505213e-05, "loss": 0.8063, "num_input_tokens_seen": 2644864, "step": 4580 }, { "epoch": 0.682901400059577, "grad_norm": 0.9631665945053101, "learning_rate": 3.413762287756926e-05, "loss": 0.8048, "num_input_tokens_seen": 2647840, "step": 4585 }, { "epoch": 0.6836461126005362, "grad_norm": 0.8861815929412842, "learning_rate": 3.417485850461722e-05, "loss": 0.8079, "num_input_tokens_seen": 2650848, "step": 4590 }, { "epoch": 0.6843908251414954, "grad_norm": 1.2538609504699707, "learning_rate": 3.421209413166518e-05, "loss": 0.8336, "num_input_tokens_seen": 2653536, "step": 4595 }, { "epoch": 0.6851355376824546, "grad_norm": 1.3796552419662476, "learning_rate": 3.424932975871314e-05, "loss": 0.7977, "num_input_tokens_seen": 2656576, "step": 4600 }, { "epoch": 0.6858802502234138, "grad_norm": 1.284213900566101, "learning_rate": 3.42865653857611e-05, "loss": 0.9269, "num_input_tokens_seen": 2659776, "step": 4605 }, { "epoch": 0.6866249627643729, "grad_norm": 0.9998615980148315, "learning_rate": 3.432380101280906e-05, "loss": 0.7735, "num_input_tokens_seen": 2662720, "step": 4610 }, { "epoch": 0.6873696753053321, "grad_norm": 1.2473645210266113, "learning_rate": 3.436103663985702e-05, "loss": 0.826, "num_input_tokens_seen": 2665440, "step": 4615 }, { "epoch": 0.6881143878462913, "grad_norm": 1.2293469905853271, "learning_rate": 3.4398272266904977e-05, "loss": 0.8375, "num_input_tokens_seen": 2668224, "step": 4620 }, { "epoch": 0.6888591003872505, "grad_norm": 1.1415026187896729, "learning_rate": 3.443550789395293e-05, "loss": 0.8139, "num_input_tokens_seen": 2671040, "step": 4625 }, { "epoch": 0.6896038129282097, "grad_norm": 1.523759365081787, "learning_rate": 3.4472743521000896e-05, "loss": 0.9026, "num_input_tokens_seen": 2673792, "step": 4630 }, { "epoch": 0.6903485254691689, "grad_norm": 1.0459692478179932, "learning_rate": 3.450997914804885e-05, "loss": 0.8444, "num_input_tokens_seen": 2676672, "step": 4635 }, { "epoch": 0.6910932380101281, "grad_norm": 1.004744291305542, "learning_rate": 3.4547214775096816e-05, "loss": 0.8283, "num_input_tokens_seen": 2679424, "step": 4640 }, { "epoch": 0.6918379505510873, "grad_norm": 1.8462529182434082, "learning_rate": 3.458445040214477e-05, "loss": 0.8011, "num_input_tokens_seen": 2682240, "step": 4645 }, { "epoch": 0.6925826630920465, "grad_norm": 1.2615535259246826, "learning_rate": 3.4621686029192736e-05, "loss": 0.8598, "num_input_tokens_seen": 2685376, "step": 4650 }, { "epoch": 0.6933273756330056, "grad_norm": 0.967686116695404, "learning_rate": 3.465892165624069e-05, "loss": 0.8298, "num_input_tokens_seen": 2687968, "step": 4655 }, { "epoch": 0.6940720881739648, "grad_norm": 1.1081655025482178, "learning_rate": 3.4696157283288655e-05, "loss": 0.7971, "num_input_tokens_seen": 2690624, "step": 4660 }, { "epoch": 0.694816800714924, "grad_norm": 1.0656981468200684, "learning_rate": 3.473339291033661e-05, "loss": 0.8284, "num_input_tokens_seen": 2693664, "step": 4665 }, { "epoch": 0.6955615132558832, "grad_norm": 1.1145304441452026, "learning_rate": 3.477062853738457e-05, "loss": 0.8783, "num_input_tokens_seen": 2696576, "step": 4670 }, { "epoch": 0.6963062257968424, "grad_norm": 1.2717957496643066, "learning_rate": 3.480786416443253e-05, "loss": 0.9302, "num_input_tokens_seen": 2699488, "step": 4675 }, { "epoch": 0.6970509383378016, "grad_norm": 0.961633563041687, "learning_rate": 3.484509979148049e-05, "loss": 0.8312, "num_input_tokens_seen": 2702336, "step": 4680 }, { "epoch": 0.6977956508787608, "grad_norm": 1.2145353555679321, "learning_rate": 3.488233541852845e-05, "loss": 0.8353, "num_input_tokens_seen": 2705152, "step": 4685 }, { "epoch": 0.69854036341972, "grad_norm": 0.9414501786231995, "learning_rate": 3.491957104557641e-05, "loss": 0.8096, "num_input_tokens_seen": 2708096, "step": 4690 }, { "epoch": 0.6992850759606791, "grad_norm": 1.3493582010269165, "learning_rate": 3.495680667262437e-05, "loss": 0.8656, "num_input_tokens_seen": 2710944, "step": 4695 }, { "epoch": 0.7000297885016383, "grad_norm": 1.0855669975280762, "learning_rate": 3.499404229967233e-05, "loss": 0.8342, "num_input_tokens_seen": 2713664, "step": 4700 }, { "epoch": 0.7007745010425975, "grad_norm": 1.492720603942871, "learning_rate": 3.5031277926720284e-05, "loss": 0.8999, "num_input_tokens_seen": 2716896, "step": 4705 }, { "epoch": 0.7015192135835567, "grad_norm": 1.2404175996780396, "learning_rate": 3.506851355376825e-05, "loss": 0.8234, "num_input_tokens_seen": 2719584, "step": 4710 }, { "epoch": 0.7022639261245159, "grad_norm": 1.0065934658050537, "learning_rate": 3.5105749180816204e-05, "loss": 0.7711, "num_input_tokens_seen": 2722304, "step": 4715 }, { "epoch": 0.7030086386654751, "grad_norm": 1.2405004501342773, "learning_rate": 3.514298480786417e-05, "loss": 0.8016, "num_input_tokens_seen": 2724992, "step": 4720 }, { "epoch": 0.7037533512064343, "grad_norm": 1.2088960409164429, "learning_rate": 3.518022043491212e-05, "loss": 0.8555, "num_input_tokens_seen": 2727648, "step": 4725 }, { "epoch": 0.7044980637473935, "grad_norm": 0.9338083267211914, "learning_rate": 3.5217456061960087e-05, "loss": 0.8824, "num_input_tokens_seen": 2730560, "step": 4730 }, { "epoch": 0.7052427762883527, "grad_norm": 1.0120664834976196, "learning_rate": 3.525469168900804e-05, "loss": 0.7942, "num_input_tokens_seen": 2733440, "step": 4735 }, { "epoch": 0.7059874888293118, "grad_norm": 1.9204916954040527, "learning_rate": 3.5291927316056006e-05, "loss": 0.8139, "num_input_tokens_seen": 2736320, "step": 4740 }, { "epoch": 0.706732201370271, "grad_norm": 1.3198020458221436, "learning_rate": 3.532916294310397e-05, "loss": 0.8669, "num_input_tokens_seen": 2739136, "step": 4745 }, { "epoch": 0.7074769139112302, "grad_norm": 1.3473374843597412, "learning_rate": 3.536639857015192e-05, "loss": 0.8415, "num_input_tokens_seen": 2742176, "step": 4750 }, { "epoch": 0.7082216264521894, "grad_norm": 0.9602341055870056, "learning_rate": 3.540363419719988e-05, "loss": 0.8476, "num_input_tokens_seen": 2745056, "step": 4755 }, { "epoch": 0.7089663389931486, "grad_norm": 0.9817070960998535, "learning_rate": 3.544086982424784e-05, "loss": 0.8154, "num_input_tokens_seen": 2748128, "step": 4760 }, { "epoch": 0.7097110515341079, "grad_norm": 1.3652021884918213, "learning_rate": 3.54781054512958e-05, "loss": 0.8186, "num_input_tokens_seen": 2751008, "step": 4765 }, { "epoch": 0.710455764075067, "grad_norm": 1.1924904584884644, "learning_rate": 3.551534107834376e-05, "loss": 0.833, "num_input_tokens_seen": 2754016, "step": 4770 }, { "epoch": 0.7112004766160263, "grad_norm": 1.1821420192718506, "learning_rate": 3.555257670539172e-05, "loss": 0.7953, "num_input_tokens_seen": 2757248, "step": 4775 }, { "epoch": 0.7119451891569855, "grad_norm": 1.0027382373809814, "learning_rate": 3.558981233243968e-05, "loss": 0.8469, "num_input_tokens_seen": 2760128, "step": 4780 }, { "epoch": 0.7126899016979446, "grad_norm": 1.0492500066757202, "learning_rate": 3.562704795948764e-05, "loss": 0.7976, "num_input_tokens_seen": 2763136, "step": 4785 }, { "epoch": 0.7134346142389038, "grad_norm": 0.7963113188743591, "learning_rate": 3.56642835865356e-05, "loss": 0.8459, "num_input_tokens_seen": 2765856, "step": 4790 }, { "epoch": 0.714179326779863, "grad_norm": 1.3516002893447876, "learning_rate": 3.5701519213583554e-05, "loss": 0.7915, "num_input_tokens_seen": 2768864, "step": 4795 }, { "epoch": 0.7149240393208222, "grad_norm": 1.216902494430542, "learning_rate": 3.573875484063152e-05, "loss": 0.82, "num_input_tokens_seen": 2771520, "step": 4800 }, { "epoch": 0.7156687518617814, "grad_norm": 0.9343279004096985, "learning_rate": 3.5775990467679474e-05, "loss": 0.8145, "num_input_tokens_seen": 2774560, "step": 4805 }, { "epoch": 0.7164134644027406, "grad_norm": 1.2513139247894287, "learning_rate": 3.581322609472744e-05, "loss": 0.8383, "num_input_tokens_seen": 2777312, "step": 4810 }, { "epoch": 0.7171581769436998, "grad_norm": 1.625225305557251, "learning_rate": 3.5850461721775394e-05, "loss": 0.8126, "num_input_tokens_seen": 2780128, "step": 4815 }, { "epoch": 0.717902889484659, "grad_norm": 1.0224688053131104, "learning_rate": 3.588769734882336e-05, "loss": 0.8712, "num_input_tokens_seen": 2783200, "step": 4820 }, { "epoch": 0.7186476020256181, "grad_norm": 1.5378873348236084, "learning_rate": 3.592493297587132e-05, "loss": 0.876, "num_input_tokens_seen": 2786048, "step": 4825 }, { "epoch": 0.7193923145665773, "grad_norm": 0.9782545566558838, "learning_rate": 3.596216860291928e-05, "loss": 0.8005, "num_input_tokens_seen": 2788864, "step": 4830 }, { "epoch": 0.7201370271075365, "grad_norm": 1.2315266132354736, "learning_rate": 3.599940422996723e-05, "loss": 0.9116, "num_input_tokens_seen": 2791552, "step": 4835 }, { "epoch": 0.7208817396484957, "grad_norm": 0.7786136865615845, "learning_rate": 3.603663985701519e-05, "loss": 0.8703, "num_input_tokens_seen": 2794528, "step": 4840 }, { "epoch": 0.7216264521894549, "grad_norm": 1.0597201585769653, "learning_rate": 3.607387548406315e-05, "loss": 0.8358, "num_input_tokens_seen": 2797280, "step": 4845 }, { "epoch": 0.7223711647304141, "grad_norm": 0.9696630835533142, "learning_rate": 3.611111111111111e-05, "loss": 0.8359, "num_input_tokens_seen": 2799872, "step": 4850 }, { "epoch": 0.7231158772713733, "grad_norm": 0.893944501876831, "learning_rate": 3.614834673815907e-05, "loss": 0.7986, "num_input_tokens_seen": 2802560, "step": 4855 }, { "epoch": 0.7238605898123325, "grad_norm": 1.2188513278961182, "learning_rate": 3.618558236520703e-05, "loss": 0.8136, "num_input_tokens_seen": 2805536, "step": 4860 }, { "epoch": 0.7246053023532917, "grad_norm": 1.152150273323059, "learning_rate": 3.622281799225499e-05, "loss": 0.8491, "num_input_tokens_seen": 2808576, "step": 4865 }, { "epoch": 0.7253500148942508, "grad_norm": 1.068213939666748, "learning_rate": 3.6260053619302956e-05, "loss": 0.8172, "num_input_tokens_seen": 2811680, "step": 4870 }, { "epoch": 0.72609472743521, "grad_norm": 1.2289010286331177, "learning_rate": 3.629728924635091e-05, "loss": 0.8376, "num_input_tokens_seen": 2814528, "step": 4875 }, { "epoch": 0.7268394399761692, "grad_norm": 1.380643606185913, "learning_rate": 3.633452487339887e-05, "loss": 0.8615, "num_input_tokens_seen": 2817056, "step": 4880 }, { "epoch": 0.7275841525171284, "grad_norm": 1.0037530660629272, "learning_rate": 3.6371760500446825e-05, "loss": 0.8392, "num_input_tokens_seen": 2820032, "step": 4885 }, { "epoch": 0.7283288650580876, "grad_norm": 1.1168553829193115, "learning_rate": 3.640899612749479e-05, "loss": 0.7473, "num_input_tokens_seen": 2822752, "step": 4890 }, { "epoch": 0.7290735775990468, "grad_norm": 0.9229227304458618, "learning_rate": 3.6446231754542745e-05, "loss": 0.8504, "num_input_tokens_seen": 2825568, "step": 4895 }, { "epoch": 0.729818290140006, "grad_norm": 0.9209307432174683, "learning_rate": 3.648346738159071e-05, "loss": 0.8586, "num_input_tokens_seen": 2828576, "step": 4900 }, { "epoch": 0.7305630026809652, "grad_norm": 0.9376413822174072, "learning_rate": 3.652070300863867e-05, "loss": 0.8312, "num_input_tokens_seen": 2831360, "step": 4905 }, { "epoch": 0.7313077152219244, "grad_norm": 1.1135636568069458, "learning_rate": 3.655793863568663e-05, "loss": 0.8026, "num_input_tokens_seen": 2833952, "step": 4910 }, { "epoch": 0.7320524277628835, "grad_norm": 1.2312172651290894, "learning_rate": 3.659517426273459e-05, "loss": 0.78, "num_input_tokens_seen": 2837280, "step": 4915 }, { "epoch": 0.7327971403038427, "grad_norm": 0.8439638018608093, "learning_rate": 3.663240988978254e-05, "loss": 0.7906, "num_input_tokens_seen": 2840000, "step": 4920 }, { "epoch": 0.7335418528448019, "grad_norm": 1.007658839225769, "learning_rate": 3.6669645516830504e-05, "loss": 0.8297, "num_input_tokens_seen": 2842912, "step": 4925 }, { "epoch": 0.7342865653857611, "grad_norm": 1.0834404230117798, "learning_rate": 3.670688114387846e-05, "loss": 0.9145, "num_input_tokens_seen": 2845952, "step": 4930 }, { "epoch": 0.7350312779267203, "grad_norm": 0.9513331651687622, "learning_rate": 3.6744116770926424e-05, "loss": 0.8224, "num_input_tokens_seen": 2848736, "step": 4935 }, { "epoch": 0.7357759904676795, "grad_norm": 1.3799458742141724, "learning_rate": 3.678135239797439e-05, "loss": 0.8421, "num_input_tokens_seen": 2851616, "step": 4940 }, { "epoch": 0.7365207030086387, "grad_norm": 1.213901400566101, "learning_rate": 3.681858802502234e-05, "loss": 0.893, "num_input_tokens_seen": 2854368, "step": 4945 }, { "epoch": 0.7372654155495979, "grad_norm": 0.9189006686210632, "learning_rate": 3.6855823652070307e-05, "loss": 0.7954, "num_input_tokens_seen": 2857312, "step": 4950 }, { "epoch": 0.738010128090557, "grad_norm": 0.8028642535209656, "learning_rate": 3.689305927911826e-05, "loss": 0.8337, "num_input_tokens_seen": 2860320, "step": 4955 }, { "epoch": 0.7387548406315162, "grad_norm": 0.8312283158302307, "learning_rate": 3.6930294906166226e-05, "loss": 0.8513, "num_input_tokens_seen": 2863072, "step": 4960 }, { "epoch": 0.7394995531724754, "grad_norm": 0.8848573565483093, "learning_rate": 3.6967530533214176e-05, "loss": 0.8702, "num_input_tokens_seen": 2866112, "step": 4965 }, { "epoch": 0.7402442657134346, "grad_norm": 1.0260246992111206, "learning_rate": 3.700476616026214e-05, "loss": 0.8114, "num_input_tokens_seen": 2868960, "step": 4970 }, { "epoch": 0.7409889782543938, "grad_norm": 1.3906184434890747, "learning_rate": 3.7042001787310096e-05, "loss": 0.8409, "num_input_tokens_seen": 2871680, "step": 4975 }, { "epoch": 0.741733690795353, "grad_norm": 0.8660810589790344, "learning_rate": 3.707923741435806e-05, "loss": 0.8726, "num_input_tokens_seen": 2874496, "step": 4980 }, { "epoch": 0.7424784033363122, "grad_norm": 1.28335440158844, "learning_rate": 3.711647304140602e-05, "loss": 0.8542, "num_input_tokens_seen": 2877536, "step": 4985 }, { "epoch": 0.7432231158772714, "grad_norm": 0.944863498210907, "learning_rate": 3.715370866845398e-05, "loss": 0.7696, "num_input_tokens_seen": 2880480, "step": 4990 }, { "epoch": 0.7439678284182306, "grad_norm": 0.856024444103241, "learning_rate": 3.719094429550194e-05, "loss": 0.8109, "num_input_tokens_seen": 2883296, "step": 4995 }, { "epoch": 0.7447125409591897, "grad_norm": 1.1930752992630005, "learning_rate": 3.72281799225499e-05, "loss": 0.8703, "num_input_tokens_seen": 2885824, "step": 5000 }, { "epoch": 0.7454572535001489, "grad_norm": 0.9162873029708862, "learning_rate": 3.726541554959786e-05, "loss": 0.8005, "num_input_tokens_seen": 2888448, "step": 5005 }, { "epoch": 0.7462019660411081, "grad_norm": 1.5417958498001099, "learning_rate": 3.730265117664581e-05, "loss": 0.8564, "num_input_tokens_seen": 2891104, "step": 5010 }, { "epoch": 0.7469466785820673, "grad_norm": 0.9303978681564331, "learning_rate": 3.7339886803693774e-05, "loss": 0.8264, "num_input_tokens_seen": 2893888, "step": 5015 }, { "epoch": 0.7476913911230265, "grad_norm": 1.1245824098587036, "learning_rate": 3.737712243074174e-05, "loss": 0.8307, "num_input_tokens_seen": 2896192, "step": 5020 }, { "epoch": 0.7484361036639857, "grad_norm": 1.1027028560638428, "learning_rate": 3.7414358057789694e-05, "loss": 0.801, "num_input_tokens_seen": 2899008, "step": 5025 }, { "epoch": 0.7491808162049449, "grad_norm": 1.0556631088256836, "learning_rate": 3.745159368483766e-05, "loss": 0.8535, "num_input_tokens_seen": 2901920, "step": 5030 }, { "epoch": 0.7499255287459041, "grad_norm": 1.3408770561218262, "learning_rate": 3.7488829311885614e-05, "loss": 0.8719, "num_input_tokens_seen": 2905184, "step": 5035 }, { "epoch": 0.7506702412868632, "grad_norm": 1.1955475807189941, "learning_rate": 3.752606493893358e-05, "loss": 0.8236, "num_input_tokens_seen": 2908000, "step": 5040 }, { "epoch": 0.7514149538278224, "grad_norm": 0.8407461643218994, "learning_rate": 3.7563300565981534e-05, "loss": 0.7856, "num_input_tokens_seen": 2910720, "step": 5045 }, { "epoch": 0.7521596663687816, "grad_norm": 1.05118989944458, "learning_rate": 3.760053619302949e-05, "loss": 0.8222, "num_input_tokens_seen": 2913728, "step": 5050 }, { "epoch": 0.7529043789097408, "grad_norm": 1.0496995449066162, "learning_rate": 3.7637771820077446e-05, "loss": 0.8059, "num_input_tokens_seen": 2916352, "step": 5055 }, { "epoch": 0.7536490914507, "grad_norm": 1.2455360889434814, "learning_rate": 3.767500744712541e-05, "loss": 0.7956, "num_input_tokens_seen": 2919168, "step": 5060 }, { "epoch": 0.7543938039916592, "grad_norm": 1.120810866355896, "learning_rate": 3.771224307417337e-05, "loss": 0.8077, "num_input_tokens_seen": 2921696, "step": 5065 }, { "epoch": 0.7551385165326184, "grad_norm": 0.9055120944976807, "learning_rate": 3.774947870122133e-05, "loss": 0.8178, "num_input_tokens_seen": 2924544, "step": 5070 }, { "epoch": 0.7558832290735776, "grad_norm": 1.1176071166992188, "learning_rate": 3.778671432826929e-05, "loss": 0.9025, "num_input_tokens_seen": 2927520, "step": 5075 }, { "epoch": 0.7566279416145368, "grad_norm": 1.0526490211486816, "learning_rate": 3.782394995531725e-05, "loss": 0.8836, "num_input_tokens_seen": 2930560, "step": 5080 }, { "epoch": 0.7573726541554959, "grad_norm": 0.923673152923584, "learning_rate": 3.786118558236521e-05, "loss": 0.8182, "num_input_tokens_seen": 2933088, "step": 5085 }, { "epoch": 0.7581173666964551, "grad_norm": 1.006963849067688, "learning_rate": 3.789842120941317e-05, "loss": 0.8471, "num_input_tokens_seen": 2935808, "step": 5090 }, { "epoch": 0.7588620792374143, "grad_norm": 0.9522362351417542, "learning_rate": 3.7935656836461125e-05, "loss": 0.8613, "num_input_tokens_seen": 2938624, "step": 5095 }, { "epoch": 0.7596067917783735, "grad_norm": 1.402834415435791, "learning_rate": 3.797289246350909e-05, "loss": 0.8022, "num_input_tokens_seen": 2941600, "step": 5100 }, { "epoch": 0.7603515043193327, "grad_norm": 1.3122442960739136, "learning_rate": 3.8010128090557045e-05, "loss": 0.8491, "num_input_tokens_seen": 2944576, "step": 5105 }, { "epoch": 0.7610962168602919, "grad_norm": 0.6766989231109619, "learning_rate": 3.804736371760501e-05, "loss": 0.7866, "num_input_tokens_seen": 2947424, "step": 5110 }, { "epoch": 0.7618409294012511, "grad_norm": 1.4426848888397217, "learning_rate": 3.8084599344652965e-05, "loss": 0.8188, "num_input_tokens_seen": 2950336, "step": 5115 }, { "epoch": 0.7625856419422103, "grad_norm": 1.0070401430130005, "learning_rate": 3.812183497170093e-05, "loss": 0.8577, "num_input_tokens_seen": 2952992, "step": 5120 }, { "epoch": 0.7633303544831695, "grad_norm": 0.9388384222984314, "learning_rate": 3.8159070598748884e-05, "loss": 0.8525, "num_input_tokens_seen": 2955776, "step": 5125 }, { "epoch": 0.7640750670241286, "grad_norm": 1.1150439977645874, "learning_rate": 3.819630622579685e-05, "loss": 0.8629, "num_input_tokens_seen": 2958592, "step": 5130 }, { "epoch": 0.7648197795650878, "grad_norm": 1.2852503061294556, "learning_rate": 3.8233541852844804e-05, "loss": 0.7921, "num_input_tokens_seen": 2961408, "step": 5135 }, { "epoch": 0.765564492106047, "grad_norm": 1.4140664339065552, "learning_rate": 3.827077747989276e-05, "loss": 0.9147, "num_input_tokens_seen": 2964064, "step": 5140 }, { "epoch": 0.7663092046470062, "grad_norm": 1.3116527795791626, "learning_rate": 3.8308013106940724e-05, "loss": 0.9008, "num_input_tokens_seen": 2966560, "step": 5145 }, { "epoch": 0.7670539171879655, "grad_norm": 0.7471579313278198, "learning_rate": 3.834524873398868e-05, "loss": 0.8394, "num_input_tokens_seen": 2969344, "step": 5150 }, { "epoch": 0.7677986297289247, "grad_norm": 0.9229001402854919, "learning_rate": 3.8382484361036644e-05, "loss": 0.8234, "num_input_tokens_seen": 2972224, "step": 5155 }, { "epoch": 0.7685433422698839, "grad_norm": 1.1669433116912842, "learning_rate": 3.84197199880846e-05, "loss": 0.8585, "num_input_tokens_seen": 2975296, "step": 5160 }, { "epoch": 0.7692880548108431, "grad_norm": 1.2700726985931396, "learning_rate": 3.845695561513256e-05, "loss": 0.8379, "num_input_tokens_seen": 2978144, "step": 5165 }, { "epoch": 0.7700327673518021, "grad_norm": 0.8497989773750305, "learning_rate": 3.849419124218052e-05, "loss": 0.8331, "num_input_tokens_seen": 2980864, "step": 5170 }, { "epoch": 0.7707774798927614, "grad_norm": 1.155573844909668, "learning_rate": 3.853142686922848e-05, "loss": 0.7518, "num_input_tokens_seen": 2983488, "step": 5175 }, { "epoch": 0.7715221924337206, "grad_norm": 1.0571961402893066, "learning_rate": 3.856866249627644e-05, "loss": 0.8269, "num_input_tokens_seen": 2986304, "step": 5180 }, { "epoch": 0.7722669049746798, "grad_norm": 1.172871470451355, "learning_rate": 3.8605898123324396e-05, "loss": 0.7538, "num_input_tokens_seen": 2989120, "step": 5185 }, { "epoch": 0.773011617515639, "grad_norm": 0.8336185216903687, "learning_rate": 3.864313375037236e-05, "loss": 0.754, "num_input_tokens_seen": 2991904, "step": 5190 }, { "epoch": 0.7737563300565982, "grad_norm": 0.7791119813919067, "learning_rate": 3.8680369377420316e-05, "loss": 0.8299, "num_input_tokens_seen": 2994624, "step": 5195 }, { "epoch": 0.7745010425975574, "grad_norm": 1.1129884719848633, "learning_rate": 3.871760500446828e-05, "loss": 0.8482, "num_input_tokens_seen": 2997568, "step": 5200 }, { "epoch": 0.7752457551385166, "grad_norm": 1.5813069343566895, "learning_rate": 3.8754840631516235e-05, "loss": 0.9406, "num_input_tokens_seen": 3000448, "step": 5205 }, { "epoch": 0.7759904676794758, "grad_norm": 0.8137978911399841, "learning_rate": 3.87920762585642e-05, "loss": 0.7986, "num_input_tokens_seen": 3003392, "step": 5210 }, { "epoch": 0.7767351802204349, "grad_norm": 1.1251413822174072, "learning_rate": 3.8829311885612155e-05, "loss": 0.8289, "num_input_tokens_seen": 3006080, "step": 5215 }, { "epoch": 0.7774798927613941, "grad_norm": 0.9063993096351624, "learning_rate": 3.886654751266012e-05, "loss": 0.8102, "num_input_tokens_seen": 3008992, "step": 5220 }, { "epoch": 0.7782246053023533, "grad_norm": 0.9539239406585693, "learning_rate": 3.8903783139708075e-05, "loss": 0.8221, "num_input_tokens_seen": 3011552, "step": 5225 }, { "epoch": 0.7789693178433125, "grad_norm": 1.2172143459320068, "learning_rate": 3.894101876675603e-05, "loss": 0.8082, "num_input_tokens_seen": 3014400, "step": 5230 }, { "epoch": 0.7797140303842717, "grad_norm": 1.2311514616012573, "learning_rate": 3.8978254393803994e-05, "loss": 0.8206, "num_input_tokens_seen": 3017280, "step": 5235 }, { "epoch": 0.7804587429252309, "grad_norm": 1.1120673418045044, "learning_rate": 3.901549002085195e-05, "loss": 0.8264, "num_input_tokens_seen": 3020064, "step": 5240 }, { "epoch": 0.7812034554661901, "grad_norm": 0.8912447094917297, "learning_rate": 3.9052725647899914e-05, "loss": 0.8723, "num_input_tokens_seen": 3023072, "step": 5245 }, { "epoch": 0.7819481680071493, "grad_norm": 1.229750156402588, "learning_rate": 3.908996127494787e-05, "loss": 0.8412, "num_input_tokens_seen": 3025856, "step": 5250 }, { "epoch": 0.7826928805481085, "grad_norm": 1.2690989971160889, "learning_rate": 3.9127196901995834e-05, "loss": 0.8212, "num_input_tokens_seen": 3029152, "step": 5255 }, { "epoch": 0.7834375930890676, "grad_norm": 0.9537530541419983, "learning_rate": 3.916443252904379e-05, "loss": 0.8143, "num_input_tokens_seen": 3031904, "step": 5260 }, { "epoch": 0.7841823056300268, "grad_norm": 0.8439931273460388, "learning_rate": 3.9201668156091754e-05, "loss": 0.8606, "num_input_tokens_seen": 3034848, "step": 5265 }, { "epoch": 0.784927018170986, "grad_norm": 1.2547098398208618, "learning_rate": 3.923890378313971e-05, "loss": 0.8633, "num_input_tokens_seen": 3037888, "step": 5270 }, { "epoch": 0.7856717307119452, "grad_norm": 0.9714174270629883, "learning_rate": 3.9276139410187666e-05, "loss": 0.8155, "num_input_tokens_seen": 3040896, "step": 5275 }, { "epoch": 0.7864164432529044, "grad_norm": 0.824704110622406, "learning_rate": 3.931337503723563e-05, "loss": 0.8218, "num_input_tokens_seen": 3043840, "step": 5280 }, { "epoch": 0.7871611557938636, "grad_norm": 0.9173425436019897, "learning_rate": 3.9350610664283586e-05, "loss": 0.8192, "num_input_tokens_seen": 3046624, "step": 5285 }, { "epoch": 0.7879058683348228, "grad_norm": 1.0982238054275513, "learning_rate": 3.938784629133155e-05, "loss": 0.8414, "num_input_tokens_seen": 3049472, "step": 5290 }, { "epoch": 0.788650580875782, "grad_norm": 0.7739755511283875, "learning_rate": 3.9425081918379506e-05, "loss": 0.8393, "num_input_tokens_seen": 3052416, "step": 5295 }, { "epoch": 0.7893952934167411, "grad_norm": 0.7681728601455688, "learning_rate": 3.946231754542747e-05, "loss": 0.8196, "num_input_tokens_seen": 3055232, "step": 5300 }, { "epoch": 0.7901400059577003, "grad_norm": 0.9613179564476013, "learning_rate": 3.9499553172475426e-05, "loss": 0.8039, "num_input_tokens_seen": 3058272, "step": 5305 }, { "epoch": 0.7908847184986595, "grad_norm": 0.7741140127182007, "learning_rate": 3.953678879952338e-05, "loss": 0.7532, "num_input_tokens_seen": 3061280, "step": 5310 }, { "epoch": 0.7916294310396187, "grad_norm": 1.1309881210327148, "learning_rate": 3.9574024426571345e-05, "loss": 0.8224, "num_input_tokens_seen": 3064320, "step": 5315 }, { "epoch": 0.7923741435805779, "grad_norm": 1.10210382938385, "learning_rate": 3.96112600536193e-05, "loss": 0.8622, "num_input_tokens_seen": 3066912, "step": 5320 }, { "epoch": 0.7931188561215371, "grad_norm": 0.9730312824249268, "learning_rate": 3.9648495680667265e-05, "loss": 0.8171, "num_input_tokens_seen": 3069856, "step": 5325 }, { "epoch": 0.7938635686624963, "grad_norm": 1.023444652557373, "learning_rate": 3.968573130771522e-05, "loss": 0.8873, "num_input_tokens_seen": 3072672, "step": 5330 }, { "epoch": 0.7946082812034555, "grad_norm": 0.9425243735313416, "learning_rate": 3.9722966934763185e-05, "loss": 0.8618, "num_input_tokens_seen": 3075776, "step": 5335 }, { "epoch": 0.7953529937444147, "grad_norm": 0.9694623947143555, "learning_rate": 3.976020256181114e-05, "loss": 0.7639, "num_input_tokens_seen": 3078624, "step": 5340 }, { "epoch": 0.7960977062853738, "grad_norm": 0.8754270076751709, "learning_rate": 3.9797438188859104e-05, "loss": 0.8517, "num_input_tokens_seen": 3081728, "step": 5345 }, { "epoch": 0.796842418826333, "grad_norm": 1.4500941038131714, "learning_rate": 3.983467381590707e-05, "loss": 0.8329, "num_input_tokens_seen": 3084512, "step": 5350 }, { "epoch": 0.7975871313672922, "grad_norm": 1.0634108781814575, "learning_rate": 3.987190944295502e-05, "loss": 0.8981, "num_input_tokens_seen": 3087488, "step": 5355 }, { "epoch": 0.7983318439082514, "grad_norm": 0.8592182993888855, "learning_rate": 3.990914507000298e-05, "loss": 0.8435, "num_input_tokens_seen": 3089984, "step": 5360 }, { "epoch": 0.7990765564492106, "grad_norm": 1.0373088121414185, "learning_rate": 3.994638069705094e-05, "loss": 0.8758, "num_input_tokens_seen": 3092896, "step": 5365 }, { "epoch": 0.7998212689901698, "grad_norm": 0.9472329616546631, "learning_rate": 3.99836163240989e-05, "loss": 0.8116, "num_input_tokens_seen": 3095872, "step": 5370 }, { "epoch": 0.800565981531129, "grad_norm": 1.0872668027877808, "learning_rate": 4.002085195114686e-05, "loss": 0.7823, "num_input_tokens_seen": 3098624, "step": 5375 }, { "epoch": 0.8013106940720882, "grad_norm": 0.9238336682319641, "learning_rate": 4.005808757819482e-05, "loss": 0.8608, "num_input_tokens_seen": 3101760, "step": 5380 }, { "epoch": 0.8020554066130474, "grad_norm": 0.8310611248016357, "learning_rate": 4.0095323205242776e-05, "loss": 0.8162, "num_input_tokens_seen": 3104800, "step": 5385 }, { "epoch": 0.8028001191540065, "grad_norm": 0.6808587312698364, "learning_rate": 4.013255883229074e-05, "loss": 0.7394, "num_input_tokens_seen": 3107712, "step": 5390 }, { "epoch": 0.8035448316949657, "grad_norm": 0.7556683421134949, "learning_rate": 4.01697944593387e-05, "loss": 0.8604, "num_input_tokens_seen": 3110720, "step": 5395 }, { "epoch": 0.8042895442359249, "grad_norm": 0.8667207360267639, "learning_rate": 4.020703008638665e-05, "loss": 0.8633, "num_input_tokens_seen": 3113792, "step": 5400 }, { "epoch": 0.8050342567768841, "grad_norm": 0.9123275876045227, "learning_rate": 4.0244265713434616e-05, "loss": 0.8357, "num_input_tokens_seen": 3116736, "step": 5405 }, { "epoch": 0.8057789693178433, "grad_norm": 1.138709306716919, "learning_rate": 4.028150134048257e-05, "loss": 0.777, "num_input_tokens_seen": 3119584, "step": 5410 }, { "epoch": 0.8065236818588025, "grad_norm": 1.108164668083191, "learning_rate": 4.0318736967530536e-05, "loss": 0.7763, "num_input_tokens_seen": 3122432, "step": 5415 }, { "epoch": 0.8072683943997617, "grad_norm": 0.8302445411682129, "learning_rate": 4.035597259457849e-05, "loss": 0.853, "num_input_tokens_seen": 3125184, "step": 5420 }, { "epoch": 0.8080131069407209, "grad_norm": 1.1363699436187744, "learning_rate": 4.0393208221626455e-05, "loss": 0.8641, "num_input_tokens_seen": 3128000, "step": 5425 }, { "epoch": 0.80875781948168, "grad_norm": 1.0124270915985107, "learning_rate": 4.043044384867442e-05, "loss": 0.7792, "num_input_tokens_seen": 3130944, "step": 5430 }, { "epoch": 0.8095025320226392, "grad_norm": 0.8372935652732849, "learning_rate": 4.0467679475722375e-05, "loss": 0.8809, "num_input_tokens_seen": 3133984, "step": 5435 }, { "epoch": 0.8102472445635984, "grad_norm": 1.1308504343032837, "learning_rate": 4.050491510277033e-05, "loss": 0.829, "num_input_tokens_seen": 3137120, "step": 5440 }, { "epoch": 0.8109919571045576, "grad_norm": 0.6412436366081238, "learning_rate": 4.054215072981829e-05, "loss": 0.8836, "num_input_tokens_seen": 3139840, "step": 5445 }, { "epoch": 0.8117366696455168, "grad_norm": 1.0898211002349854, "learning_rate": 4.057938635686625e-05, "loss": 0.799, "num_input_tokens_seen": 3142592, "step": 5450 }, { "epoch": 0.812481382186476, "grad_norm": 1.0559715032577515, "learning_rate": 4.061662198391421e-05, "loss": 0.8206, "num_input_tokens_seen": 3145248, "step": 5455 }, { "epoch": 0.8132260947274352, "grad_norm": 1.0253123044967651, "learning_rate": 4.065385761096217e-05, "loss": 0.8422, "num_input_tokens_seen": 3148096, "step": 5460 }, { "epoch": 0.8139708072683944, "grad_norm": 0.8731471300125122, "learning_rate": 4.069109323801013e-05, "loss": 0.7886, "num_input_tokens_seen": 3151008, "step": 5465 }, { "epoch": 0.8147155198093536, "grad_norm": 0.9190864562988281, "learning_rate": 4.072832886505809e-05, "loss": 0.8484, "num_input_tokens_seen": 3153920, "step": 5470 }, { "epoch": 0.8154602323503127, "grad_norm": 0.8507480621337891, "learning_rate": 4.0765564492106054e-05, "loss": 0.8288, "num_input_tokens_seen": 3156448, "step": 5475 }, { "epoch": 0.8162049448912719, "grad_norm": 0.9469449520111084, "learning_rate": 4.080280011915401e-05, "loss": 0.8223, "num_input_tokens_seen": 3159552, "step": 5480 }, { "epoch": 0.8169496574322311, "grad_norm": 1.099656581878662, "learning_rate": 4.084003574620197e-05, "loss": 0.8266, "num_input_tokens_seen": 3162240, "step": 5485 }, { "epoch": 0.8176943699731903, "grad_norm": 0.8766738772392273, "learning_rate": 4.087727137324992e-05, "loss": 0.804, "num_input_tokens_seen": 3164608, "step": 5490 }, { "epoch": 0.8184390825141495, "grad_norm": 0.9144515991210938, "learning_rate": 4.0914507000297886e-05, "loss": 0.8266, "num_input_tokens_seen": 3167520, "step": 5495 }, { "epoch": 0.8191837950551087, "grad_norm": 1.2989274263381958, "learning_rate": 4.095174262734584e-05, "loss": 0.7994, "num_input_tokens_seen": 3170400, "step": 5500 }, { "epoch": 0.819928507596068, "grad_norm": 0.9484414458274841, "learning_rate": 4.0988978254393806e-05, "loss": 0.8574, "num_input_tokens_seen": 3173376, "step": 5505 }, { "epoch": 0.8206732201370271, "grad_norm": 0.9530901312828064, "learning_rate": 4.102621388144177e-05, "loss": 0.8639, "num_input_tokens_seen": 3176256, "step": 5510 }, { "epoch": 0.8214179326779864, "grad_norm": 1.033367395401001, "learning_rate": 4.1063449508489726e-05, "loss": 0.826, "num_input_tokens_seen": 3179136, "step": 5515 }, { "epoch": 0.8221626452189454, "grad_norm": 0.9244956374168396, "learning_rate": 4.110068513553769e-05, "loss": 0.8664, "num_input_tokens_seen": 3181920, "step": 5520 }, { "epoch": 0.8229073577599046, "grad_norm": 1.4259613752365112, "learning_rate": 4.1137920762585646e-05, "loss": 0.8762, "num_input_tokens_seen": 3184736, "step": 5525 }, { "epoch": 0.8236520703008638, "grad_norm": 0.8427379131317139, "learning_rate": 4.11751563896336e-05, "loss": 0.822, "num_input_tokens_seen": 3187456, "step": 5530 }, { "epoch": 0.824396782841823, "grad_norm": 0.8572449088096619, "learning_rate": 4.121239201668156e-05, "loss": 0.8703, "num_input_tokens_seen": 3190240, "step": 5535 }, { "epoch": 0.8251414953827823, "grad_norm": 1.074676513671875, "learning_rate": 4.124962764372952e-05, "loss": 0.8246, "num_input_tokens_seen": 3192864, "step": 5540 }, { "epoch": 0.8258862079237415, "grad_norm": 1.1309664249420166, "learning_rate": 4.1286863270777485e-05, "loss": 0.8716, "num_input_tokens_seen": 3195392, "step": 5545 }, { "epoch": 0.8266309204647007, "grad_norm": 1.0495803356170654, "learning_rate": 4.132409889782544e-05, "loss": 0.7911, "num_input_tokens_seen": 3198176, "step": 5550 }, { "epoch": 0.8273756330056599, "grad_norm": 0.9541231393814087, "learning_rate": 4.1361334524873405e-05, "loss": 0.7978, "num_input_tokens_seen": 3201152, "step": 5555 }, { "epoch": 0.828120345546619, "grad_norm": 0.9428048729896545, "learning_rate": 4.139857015192136e-05, "loss": 0.9103, "num_input_tokens_seen": 3204320, "step": 5560 }, { "epoch": 0.8288650580875782, "grad_norm": 0.9894550442695618, "learning_rate": 4.1435805778969324e-05, "loss": 0.8277, "num_input_tokens_seen": 3207008, "step": 5565 }, { "epoch": 0.8296097706285374, "grad_norm": 0.9547988176345825, "learning_rate": 4.1473041406017274e-05, "loss": 0.8292, "num_input_tokens_seen": 3209760, "step": 5570 }, { "epoch": 0.8303544831694966, "grad_norm": 0.8899568319320679, "learning_rate": 4.151027703306524e-05, "loss": 0.9055, "num_input_tokens_seen": 3212864, "step": 5575 }, { "epoch": 0.8310991957104558, "grad_norm": 1.0157668590545654, "learning_rate": 4.1547512660113194e-05, "loss": 0.8421, "num_input_tokens_seen": 3215744, "step": 5580 }, { "epoch": 0.831843908251415, "grad_norm": 0.8239426612854004, "learning_rate": 4.158474828716116e-05, "loss": 0.8441, "num_input_tokens_seen": 3218400, "step": 5585 }, { "epoch": 0.8325886207923742, "grad_norm": 0.9898437857627869, "learning_rate": 4.162198391420912e-05, "loss": 0.8217, "num_input_tokens_seen": 3220960, "step": 5590 }, { "epoch": 0.8333333333333334, "grad_norm": 0.9356661438941956, "learning_rate": 4.165921954125708e-05, "loss": 0.8674, "num_input_tokens_seen": 3223872, "step": 5595 }, { "epoch": 0.8340780458742926, "grad_norm": 1.3489065170288086, "learning_rate": 4.169645516830504e-05, "loss": 0.8007, "num_input_tokens_seen": 3226880, "step": 5600 }, { "epoch": 0.8348227584152517, "grad_norm": 1.1643887758255005, "learning_rate": 4.1733690795352996e-05, "loss": 0.7591, "num_input_tokens_seen": 3229920, "step": 5605 }, { "epoch": 0.8355674709562109, "grad_norm": 1.092225432395935, "learning_rate": 4.177092642240096e-05, "loss": 0.8607, "num_input_tokens_seen": 3232768, "step": 5610 }, { "epoch": 0.8363121834971701, "grad_norm": 0.8887239694595337, "learning_rate": 4.180816204944891e-05, "loss": 0.7808, "num_input_tokens_seen": 3235808, "step": 5615 }, { "epoch": 0.8370568960381293, "grad_norm": 1.2399479150772095, "learning_rate": 4.184539767649687e-05, "loss": 0.8048, "num_input_tokens_seen": 3238400, "step": 5620 }, { "epoch": 0.8378016085790885, "grad_norm": 1.163072109222412, "learning_rate": 4.1882633303544836e-05, "loss": 0.8649, "num_input_tokens_seen": 3241216, "step": 5625 }, { "epoch": 0.8385463211200477, "grad_norm": 0.877793550491333, "learning_rate": 4.191986893059279e-05, "loss": 0.7945, "num_input_tokens_seen": 3244320, "step": 5630 }, { "epoch": 0.8392910336610069, "grad_norm": 0.8849369287490845, "learning_rate": 4.1957104557640756e-05, "loss": 0.7777, "num_input_tokens_seen": 3247200, "step": 5635 }, { "epoch": 0.8400357462019661, "grad_norm": 0.7105191946029663, "learning_rate": 4.199434018468871e-05, "loss": 0.863, "num_input_tokens_seen": 3250144, "step": 5640 }, { "epoch": 0.8407804587429253, "grad_norm": 1.064819097518921, "learning_rate": 4.2031575811736675e-05, "loss": 0.8369, "num_input_tokens_seen": 3252928, "step": 5645 }, { "epoch": 0.8415251712838844, "grad_norm": 0.9285297393798828, "learning_rate": 4.206881143878463e-05, "loss": 0.7442, "num_input_tokens_seen": 3256160, "step": 5650 }, { "epoch": 0.8422698838248436, "grad_norm": 0.8522478342056274, "learning_rate": 4.2106047065832595e-05, "loss": 0.8173, "num_input_tokens_seen": 3259008, "step": 5655 }, { "epoch": 0.8430145963658028, "grad_norm": 1.0039350986480713, "learning_rate": 4.2143282692880545e-05, "loss": 0.8767, "num_input_tokens_seen": 3261824, "step": 5660 }, { "epoch": 0.843759308906762, "grad_norm": 0.9334301948547363, "learning_rate": 4.218051831992851e-05, "loss": 0.8442, "num_input_tokens_seen": 3264992, "step": 5665 }, { "epoch": 0.8445040214477212, "grad_norm": 0.8933903574943542, "learning_rate": 4.221775394697647e-05, "loss": 0.8838, "num_input_tokens_seen": 3268032, "step": 5670 }, { "epoch": 0.8452487339886804, "grad_norm": 0.9013721346855164, "learning_rate": 4.225498957402443e-05, "loss": 0.8319, "num_input_tokens_seen": 3271008, "step": 5675 }, { "epoch": 0.8459934465296396, "grad_norm": 0.7677609920501709, "learning_rate": 4.229222520107239e-05, "loss": 0.8097, "num_input_tokens_seen": 3274080, "step": 5680 }, { "epoch": 0.8467381590705988, "grad_norm": 0.7083768844604492, "learning_rate": 4.232946082812035e-05, "loss": 0.7989, "num_input_tokens_seen": 3276768, "step": 5685 }, { "epoch": 0.8474828716115579, "grad_norm": 0.7876479625701904, "learning_rate": 4.236669645516831e-05, "loss": 0.861, "num_input_tokens_seen": 3279552, "step": 5690 }, { "epoch": 0.8482275841525171, "grad_norm": 1.1863534450531006, "learning_rate": 4.240393208221627e-05, "loss": 0.8348, "num_input_tokens_seen": 3282112, "step": 5695 }, { "epoch": 0.8489722966934763, "grad_norm": 0.8480328321456909, "learning_rate": 4.2441167709264223e-05, "loss": 0.7839, "num_input_tokens_seen": 3284832, "step": 5700 }, { "epoch": 0.8497170092344355, "grad_norm": 0.7998594045639038, "learning_rate": 4.247840333631219e-05, "loss": 0.8857, "num_input_tokens_seen": 3287680, "step": 5705 }, { "epoch": 0.8504617217753947, "grad_norm": 1.0998815298080444, "learning_rate": 4.251563896336014e-05, "loss": 0.8379, "num_input_tokens_seen": 3290752, "step": 5710 }, { "epoch": 0.8512064343163539, "grad_norm": 0.7254759073257446, "learning_rate": 4.2552874590408106e-05, "loss": 0.8162, "num_input_tokens_seen": 3293440, "step": 5715 }, { "epoch": 0.8519511468573131, "grad_norm": 0.8484064340591431, "learning_rate": 4.259011021745606e-05, "loss": 0.7898, "num_input_tokens_seen": 3296320, "step": 5720 }, { "epoch": 0.8526958593982723, "grad_norm": 0.9647995233535767, "learning_rate": 4.2627345844504026e-05, "loss": 0.8075, "num_input_tokens_seen": 3299296, "step": 5725 }, { "epoch": 0.8534405719392315, "grad_norm": 1.3578671216964722, "learning_rate": 4.266458147155198e-05, "loss": 0.8127, "num_input_tokens_seen": 3302176, "step": 5730 }, { "epoch": 0.8541852844801906, "grad_norm": 0.814697802066803, "learning_rate": 4.2701817098599946e-05, "loss": 0.8063, "num_input_tokens_seen": 3304992, "step": 5735 }, { "epoch": 0.8549299970211498, "grad_norm": 1.2125736474990845, "learning_rate": 4.27390527256479e-05, "loss": 0.8157, "num_input_tokens_seen": 3307968, "step": 5740 }, { "epoch": 0.855674709562109, "grad_norm": 0.943962037563324, "learning_rate": 4.277628835269586e-05, "loss": 0.8611, "num_input_tokens_seen": 3310944, "step": 5745 }, { "epoch": 0.8564194221030682, "grad_norm": 1.153760552406311, "learning_rate": 4.281352397974382e-05, "loss": 0.7721, "num_input_tokens_seen": 3313856, "step": 5750 }, { "epoch": 0.8571641346440274, "grad_norm": 0.9943203926086426, "learning_rate": 4.285075960679178e-05, "loss": 0.8099, "num_input_tokens_seen": 3316640, "step": 5755 }, { "epoch": 0.8579088471849866, "grad_norm": 0.7800800800323486, "learning_rate": 4.288799523383974e-05, "loss": 0.8227, "num_input_tokens_seen": 3319520, "step": 5760 }, { "epoch": 0.8586535597259458, "grad_norm": 0.9118934869766235, "learning_rate": 4.29252308608877e-05, "loss": 0.8812, "num_input_tokens_seen": 3322144, "step": 5765 }, { "epoch": 0.859398272266905, "grad_norm": 0.818420946598053, "learning_rate": 4.296246648793566e-05, "loss": 0.8217, "num_input_tokens_seen": 3324864, "step": 5770 }, { "epoch": 0.8601429848078642, "grad_norm": 0.9022679924964905, "learning_rate": 4.299970211498362e-05, "loss": 0.8211, "num_input_tokens_seen": 3327712, "step": 5775 }, { "epoch": 0.8608876973488233, "grad_norm": 0.8727918267250061, "learning_rate": 4.303693774203158e-05, "loss": 0.8342, "num_input_tokens_seen": 3330592, "step": 5780 }, { "epoch": 0.8616324098897825, "grad_norm": 0.9307408332824707, "learning_rate": 4.307417336907954e-05, "loss": 0.8538, "num_input_tokens_seen": 3333632, "step": 5785 }, { "epoch": 0.8623771224307417, "grad_norm": 0.9156671762466431, "learning_rate": 4.3111408996127494e-05, "loss": 0.8405, "num_input_tokens_seen": 3336384, "step": 5790 }, { "epoch": 0.8631218349717009, "grad_norm": 1.0624783039093018, "learning_rate": 4.314864462317546e-05, "loss": 0.8439, "num_input_tokens_seen": 3339424, "step": 5795 }, { "epoch": 0.8638665475126601, "grad_norm": 1.0781224966049194, "learning_rate": 4.3185880250223414e-05, "loss": 0.8093, "num_input_tokens_seen": 3342176, "step": 5800 }, { "epoch": 0.8646112600536193, "grad_norm": 0.8219512701034546, "learning_rate": 4.322311587727138e-05, "loss": 0.8067, "num_input_tokens_seen": 3345152, "step": 5805 }, { "epoch": 0.8653559725945785, "grad_norm": 0.8237998485565186, "learning_rate": 4.3260351504319333e-05, "loss": 0.7923, "num_input_tokens_seen": 3348032, "step": 5810 }, { "epoch": 0.8661006851355377, "grad_norm": 0.7975049018859863, "learning_rate": 4.32975871313673e-05, "loss": 0.8674, "num_input_tokens_seen": 3351008, "step": 5815 }, { "epoch": 0.8668453976764968, "grad_norm": 0.7989259958267212, "learning_rate": 4.333482275841525e-05, "loss": 0.8419, "num_input_tokens_seen": 3354176, "step": 5820 }, { "epoch": 0.867590110217456, "grad_norm": 1.2257256507873535, "learning_rate": 4.3372058385463216e-05, "loss": 0.8429, "num_input_tokens_seen": 3357248, "step": 5825 }, { "epoch": 0.8683348227584152, "grad_norm": 0.9860101342201233, "learning_rate": 4.340929401251117e-05, "loss": 0.8838, "num_input_tokens_seen": 3360160, "step": 5830 }, { "epoch": 0.8690795352993744, "grad_norm": 0.8900704383850098, "learning_rate": 4.344652963955913e-05, "loss": 0.8379, "num_input_tokens_seen": 3362944, "step": 5835 }, { "epoch": 0.8698242478403336, "grad_norm": 0.7963668704032898, "learning_rate": 4.348376526660709e-05, "loss": 0.8079, "num_input_tokens_seen": 3365984, "step": 5840 }, { "epoch": 0.8705689603812928, "grad_norm": 0.6544823050498962, "learning_rate": 4.352100089365505e-05, "loss": 0.8253, "num_input_tokens_seen": 3368864, "step": 5845 }, { "epoch": 0.871313672922252, "grad_norm": 1.2677230834960938, "learning_rate": 4.355823652070301e-05, "loss": 0.8497, "num_input_tokens_seen": 3371776, "step": 5850 }, { "epoch": 0.8720583854632112, "grad_norm": 1.044254183769226, "learning_rate": 4.359547214775097e-05, "loss": 0.832, "num_input_tokens_seen": 3374816, "step": 5855 }, { "epoch": 0.8728030980041704, "grad_norm": 0.7713882327079773, "learning_rate": 4.363270777479893e-05, "loss": 0.855, "num_input_tokens_seen": 3377664, "step": 5860 }, { "epoch": 0.8735478105451295, "grad_norm": 1.0043988227844238, "learning_rate": 4.366994340184689e-05, "loss": 0.823, "num_input_tokens_seen": 3380288, "step": 5865 }, { "epoch": 0.8742925230860887, "grad_norm": 0.8854109644889832, "learning_rate": 4.370717902889485e-05, "loss": 0.82, "num_input_tokens_seen": 3383328, "step": 5870 }, { "epoch": 0.8750372356270479, "grad_norm": 0.7648451328277588, "learning_rate": 4.374441465594281e-05, "loss": 0.7765, "num_input_tokens_seen": 3386240, "step": 5875 }, { "epoch": 0.8757819481680071, "grad_norm": 0.90835040807724, "learning_rate": 4.3781650282990765e-05, "loss": 0.7817, "num_input_tokens_seen": 3388960, "step": 5880 }, { "epoch": 0.8765266607089663, "grad_norm": 0.9489257335662842, "learning_rate": 4.381888591003873e-05, "loss": 0.8464, "num_input_tokens_seen": 3391744, "step": 5885 }, { "epoch": 0.8772713732499255, "grad_norm": 1.7540887594223022, "learning_rate": 4.3856121537086684e-05, "loss": 0.8316, "num_input_tokens_seen": 3394400, "step": 5890 }, { "epoch": 0.8780160857908847, "grad_norm": 0.7325718402862549, "learning_rate": 4.389335716413465e-05, "loss": 0.8531, "num_input_tokens_seen": 3397280, "step": 5895 }, { "epoch": 0.878760798331844, "grad_norm": 0.8961846828460693, "learning_rate": 4.3930592791182604e-05, "loss": 0.8227, "num_input_tokens_seen": 3400160, "step": 5900 }, { "epoch": 0.8795055108728032, "grad_norm": 0.8583387732505798, "learning_rate": 4.396782841823057e-05, "loss": 0.857, "num_input_tokens_seen": 3403392, "step": 5905 }, { "epoch": 0.8802502234137622, "grad_norm": 1.1471123695373535, "learning_rate": 4.4005064045278524e-05, "loss": 0.8756, "num_input_tokens_seen": 3406432, "step": 5910 }, { "epoch": 0.8809949359547214, "grad_norm": 0.9523840546607971, "learning_rate": 4.404229967232648e-05, "loss": 0.8398, "num_input_tokens_seen": 3409248, "step": 5915 }, { "epoch": 0.8817396484956807, "grad_norm": 0.782649040222168, "learning_rate": 4.4079535299374443e-05, "loss": 0.8178, "num_input_tokens_seen": 3412064, "step": 5920 }, { "epoch": 0.8824843610366399, "grad_norm": 1.00520658493042, "learning_rate": 4.41167709264224e-05, "loss": 0.8301, "num_input_tokens_seen": 3414816, "step": 5925 }, { "epoch": 0.883229073577599, "grad_norm": 0.9770618081092834, "learning_rate": 4.415400655347036e-05, "loss": 0.8982, "num_input_tokens_seen": 3417664, "step": 5930 }, { "epoch": 0.8839737861185583, "grad_norm": 1.0237345695495605, "learning_rate": 4.419124218051832e-05, "loss": 0.8073, "num_input_tokens_seen": 3420448, "step": 5935 }, { "epoch": 0.8847184986595175, "grad_norm": 1.389565110206604, "learning_rate": 4.422847780756628e-05, "loss": 0.8495, "num_input_tokens_seen": 3423136, "step": 5940 }, { "epoch": 0.8854632112004767, "grad_norm": 0.736701250076294, "learning_rate": 4.426571343461424e-05, "loss": 0.8342, "num_input_tokens_seen": 3425792, "step": 5945 }, { "epoch": 0.8862079237414358, "grad_norm": 0.9016003012657166, "learning_rate": 4.43029490616622e-05, "loss": 0.8136, "num_input_tokens_seen": 3428544, "step": 5950 }, { "epoch": 0.886952636282395, "grad_norm": 0.7582675814628601, "learning_rate": 4.4340184688710166e-05, "loss": 0.7997, "num_input_tokens_seen": 3431200, "step": 5955 }, { "epoch": 0.8876973488233542, "grad_norm": 1.0300041437149048, "learning_rate": 4.4377420315758115e-05, "loss": 0.818, "num_input_tokens_seen": 3433760, "step": 5960 }, { "epoch": 0.8884420613643134, "grad_norm": 0.7939468622207642, "learning_rate": 4.441465594280608e-05, "loss": 0.8105, "num_input_tokens_seen": 3436896, "step": 5965 }, { "epoch": 0.8891867739052726, "grad_norm": 1.2775992155075073, "learning_rate": 4.4451891569854035e-05, "loss": 0.8279, "num_input_tokens_seen": 3440128, "step": 5970 }, { "epoch": 0.8899314864462318, "grad_norm": 0.9011468291282654, "learning_rate": 4.4489127196902e-05, "loss": 0.8176, "num_input_tokens_seen": 3443072, "step": 5975 }, { "epoch": 0.890676198987191, "grad_norm": 1.3893486261367798, "learning_rate": 4.4526362823949955e-05, "loss": 0.8992, "num_input_tokens_seen": 3445888, "step": 5980 }, { "epoch": 0.8914209115281502, "grad_norm": 0.885771632194519, "learning_rate": 4.456359845099792e-05, "loss": 0.8219, "num_input_tokens_seen": 3448672, "step": 5985 }, { "epoch": 0.8921656240691094, "grad_norm": 0.9554117918014526, "learning_rate": 4.4600834078045875e-05, "loss": 0.8521, "num_input_tokens_seen": 3451648, "step": 5990 }, { "epoch": 0.8929103366100685, "grad_norm": 0.839567244052887, "learning_rate": 4.463806970509384e-05, "loss": 0.812, "num_input_tokens_seen": 3454592, "step": 5995 }, { "epoch": 0.8936550491510277, "grad_norm": 1.1181761026382446, "learning_rate": 4.46753053321418e-05, "loss": 0.8234, "num_input_tokens_seen": 3457120, "step": 6000 }, { "epoch": 0.8943997616919869, "grad_norm": 0.8615491986274719, "learning_rate": 4.471254095918975e-05, "loss": 0.8496, "num_input_tokens_seen": 3460608, "step": 6005 }, { "epoch": 0.8951444742329461, "grad_norm": 0.8912745118141174, "learning_rate": 4.4749776586237714e-05, "loss": 0.8378, "num_input_tokens_seen": 3463744, "step": 6010 }, { "epoch": 0.8958891867739053, "grad_norm": 0.8388967514038086, "learning_rate": 4.478701221328567e-05, "loss": 0.8456, "num_input_tokens_seen": 3466560, "step": 6015 }, { "epoch": 0.8966338993148645, "grad_norm": 0.917101263999939, "learning_rate": 4.4824247840333634e-05, "loss": 0.8121, "num_input_tokens_seen": 3469376, "step": 6020 }, { "epoch": 0.8973786118558237, "grad_norm": 0.7578571438789368, "learning_rate": 4.486148346738159e-05, "loss": 0.8064, "num_input_tokens_seen": 3472288, "step": 6025 }, { "epoch": 0.8981233243967829, "grad_norm": 0.6942359209060669, "learning_rate": 4.4898719094429553e-05, "loss": 0.8387, "num_input_tokens_seen": 3474944, "step": 6030 }, { "epoch": 0.898868036937742, "grad_norm": 0.8654682040214539, "learning_rate": 4.493595472147752e-05, "loss": 0.87, "num_input_tokens_seen": 3477664, "step": 6035 }, { "epoch": 0.8996127494787012, "grad_norm": 0.745650053024292, "learning_rate": 4.497319034852547e-05, "loss": 0.8853, "num_input_tokens_seen": 3480416, "step": 6040 }, { "epoch": 0.9003574620196604, "grad_norm": 1.0751484632492065, "learning_rate": 4.501042597557343e-05, "loss": 0.8596, "num_input_tokens_seen": 3483136, "step": 6045 }, { "epoch": 0.9011021745606196, "grad_norm": 0.7243345975875854, "learning_rate": 4.5047661602621386e-05, "loss": 0.7899, "num_input_tokens_seen": 3485728, "step": 6050 }, { "epoch": 0.9018468871015788, "grad_norm": 0.8639854788780212, "learning_rate": 4.508489722966935e-05, "loss": 0.8205, "num_input_tokens_seen": 3488544, "step": 6055 }, { "epoch": 0.902591599642538, "grad_norm": 0.8247674703598022, "learning_rate": 4.5122132856717306e-05, "loss": 0.8192, "num_input_tokens_seen": 3491744, "step": 6060 }, { "epoch": 0.9033363121834972, "grad_norm": 0.9281846284866333, "learning_rate": 4.515936848376527e-05, "loss": 0.8229, "num_input_tokens_seen": 3494592, "step": 6065 }, { "epoch": 0.9040810247244564, "grad_norm": 0.9001657962799072, "learning_rate": 4.5196604110813225e-05, "loss": 0.8114, "num_input_tokens_seen": 3497344, "step": 6070 }, { "epoch": 0.9048257372654156, "grad_norm": 0.7239602208137512, "learning_rate": 4.523383973786119e-05, "loss": 0.7932, "num_input_tokens_seen": 3500224, "step": 6075 }, { "epoch": 0.9055704498063747, "grad_norm": 0.7693927884101868, "learning_rate": 4.527107536490915e-05, "loss": 0.7958, "num_input_tokens_seen": 3503040, "step": 6080 }, { "epoch": 0.9063151623473339, "grad_norm": 0.905077338218689, "learning_rate": 4.530831099195711e-05, "loss": 0.8395, "num_input_tokens_seen": 3506016, "step": 6085 }, { "epoch": 0.9070598748882931, "grad_norm": 0.7948490381240845, "learning_rate": 4.5345546619005065e-05, "loss": 0.8075, "num_input_tokens_seen": 3509088, "step": 6090 }, { "epoch": 0.9078045874292523, "grad_norm": 0.9864904880523682, "learning_rate": 4.538278224605302e-05, "loss": 0.738, "num_input_tokens_seen": 3511904, "step": 6095 }, { "epoch": 0.9085492999702115, "grad_norm": 1.0185368061065674, "learning_rate": 4.5420017873100985e-05, "loss": 0.7803, "num_input_tokens_seen": 3514624, "step": 6100 }, { "epoch": 0.9092940125111707, "grad_norm": 0.7849475145339966, "learning_rate": 4.545725350014894e-05, "loss": 0.7887, "num_input_tokens_seen": 3517408, "step": 6105 }, { "epoch": 0.9100387250521299, "grad_norm": 0.8613041639328003, "learning_rate": 4.5494489127196904e-05, "loss": 0.8291, "num_input_tokens_seen": 3520576, "step": 6110 }, { "epoch": 0.9107834375930891, "grad_norm": 0.800410270690918, "learning_rate": 4.553172475424487e-05, "loss": 0.936, "num_input_tokens_seen": 3523424, "step": 6115 }, { "epoch": 0.9115281501340483, "grad_norm": 0.9301359057426453, "learning_rate": 4.5568960381292824e-05, "loss": 0.8522, "num_input_tokens_seen": 3526400, "step": 6120 }, { "epoch": 0.9122728626750074, "grad_norm": 0.9564660787582397, "learning_rate": 4.560619600834079e-05, "loss": 0.9045, "num_input_tokens_seen": 3529952, "step": 6125 }, { "epoch": 0.9130175752159666, "grad_norm": 0.8881258964538574, "learning_rate": 4.5643431635388744e-05, "loss": 0.9117, "num_input_tokens_seen": 3532992, "step": 6130 }, { "epoch": 0.9137622877569258, "grad_norm": 0.8687940835952759, "learning_rate": 4.56806672624367e-05, "loss": 0.816, "num_input_tokens_seen": 3536096, "step": 6135 }, { "epoch": 0.914507000297885, "grad_norm": 0.851625382900238, "learning_rate": 4.5717902889484657e-05, "loss": 0.7844, "num_input_tokens_seen": 3539456, "step": 6140 }, { "epoch": 0.9152517128388442, "grad_norm": 0.5863285660743713, "learning_rate": 4.575513851653262e-05, "loss": 0.8242, "num_input_tokens_seen": 3542464, "step": 6145 }, { "epoch": 0.9159964253798034, "grad_norm": 0.7148517370223999, "learning_rate": 4.579237414358058e-05, "loss": 0.8642, "num_input_tokens_seen": 3545632, "step": 6150 }, { "epoch": 0.9167411379207626, "grad_norm": 0.7093276977539062, "learning_rate": 4.582960977062854e-05, "loss": 0.8139, "num_input_tokens_seen": 3548640, "step": 6155 }, { "epoch": 0.9174858504617218, "grad_norm": 1.0088222026824951, "learning_rate": 4.58668453976765e-05, "loss": 0.7963, "num_input_tokens_seen": 3551424, "step": 6160 }, { "epoch": 0.9182305630026809, "grad_norm": 0.5499330163002014, "learning_rate": 4.590408102472446e-05, "loss": 0.8205, "num_input_tokens_seen": 3554080, "step": 6165 }, { "epoch": 0.9189752755436401, "grad_norm": 0.6665949821472168, "learning_rate": 4.594131665177242e-05, "loss": 0.8148, "num_input_tokens_seen": 3556960, "step": 6170 }, { "epoch": 0.9197199880845993, "grad_norm": 1.2765365839004517, "learning_rate": 4.597855227882037e-05, "loss": 0.7726, "num_input_tokens_seen": 3560064, "step": 6175 }, { "epoch": 0.9204647006255585, "grad_norm": 0.8564407229423523, "learning_rate": 4.6015787905868335e-05, "loss": 0.8501, "num_input_tokens_seen": 3563040, "step": 6180 }, { "epoch": 0.9212094131665177, "grad_norm": 0.8364012837409973, "learning_rate": 4.605302353291629e-05, "loss": 0.7977, "num_input_tokens_seen": 3566272, "step": 6185 }, { "epoch": 0.9219541257074769, "grad_norm": 0.7988649606704712, "learning_rate": 4.6090259159964255e-05, "loss": 0.8765, "num_input_tokens_seen": 3569120, "step": 6190 }, { "epoch": 0.9226988382484361, "grad_norm": 0.7860931754112244, "learning_rate": 4.612749478701222e-05, "loss": 0.8507, "num_input_tokens_seen": 3572064, "step": 6195 }, { "epoch": 0.9234435507893953, "grad_norm": 0.7373763918876648, "learning_rate": 4.6164730414060175e-05, "loss": 0.8535, "num_input_tokens_seen": 3574944, "step": 6200 }, { "epoch": 0.9241882633303545, "grad_norm": 0.811471164226532, "learning_rate": 4.620196604110814e-05, "loss": 0.8174, "num_input_tokens_seen": 3578176, "step": 6205 }, { "epoch": 0.9249329758713136, "grad_norm": 0.8981873989105225, "learning_rate": 4.6239201668156095e-05, "loss": 0.7596, "num_input_tokens_seen": 3581280, "step": 6210 }, { "epoch": 0.9256776884122728, "grad_norm": 0.8150624632835388, "learning_rate": 4.627643729520406e-05, "loss": 0.7715, "num_input_tokens_seen": 3584064, "step": 6215 }, { "epoch": 0.926422400953232, "grad_norm": 1.2218635082244873, "learning_rate": 4.631367292225201e-05, "loss": 0.8621, "num_input_tokens_seen": 3587200, "step": 6220 }, { "epoch": 0.9271671134941912, "grad_norm": 1.0525901317596436, "learning_rate": 4.635090854929997e-05, "loss": 0.8566, "num_input_tokens_seen": 3590208, "step": 6225 }, { "epoch": 0.9279118260351504, "grad_norm": 0.9927219152450562, "learning_rate": 4.6388144176347934e-05, "loss": 0.8153, "num_input_tokens_seen": 3593152, "step": 6230 }, { "epoch": 0.9286565385761096, "grad_norm": 0.9003706574440002, "learning_rate": 4.642537980339589e-05, "loss": 0.8075, "num_input_tokens_seen": 3596160, "step": 6235 }, { "epoch": 0.9294012511170688, "grad_norm": 0.8151739835739136, "learning_rate": 4.6462615430443854e-05, "loss": 0.8478, "num_input_tokens_seen": 3599104, "step": 6240 }, { "epoch": 0.930145963658028, "grad_norm": 0.8916120529174805, "learning_rate": 4.649985105749181e-05, "loss": 0.7987, "num_input_tokens_seen": 3602144, "step": 6245 }, { "epoch": 0.9308906761989872, "grad_norm": 0.9742170572280884, "learning_rate": 4.653708668453977e-05, "loss": 0.9406, "num_input_tokens_seen": 3605280, "step": 6250 }, { "epoch": 0.9316353887399463, "grad_norm": 0.9765986800193787, "learning_rate": 4.657432231158773e-05, "loss": 0.8712, "num_input_tokens_seen": 3608128, "step": 6255 }, { "epoch": 0.9323801012809055, "grad_norm": 0.6546464562416077, "learning_rate": 4.661155793863569e-05, "loss": 0.8127, "num_input_tokens_seen": 3610624, "step": 6260 }, { "epoch": 0.9331248138218647, "grad_norm": 0.7561929225921631, "learning_rate": 4.664879356568364e-05, "loss": 0.8115, "num_input_tokens_seen": 3613504, "step": 6265 }, { "epoch": 0.9338695263628239, "grad_norm": 0.8142368197441101, "learning_rate": 4.6686029192731606e-05, "loss": 0.8255, "num_input_tokens_seen": 3616320, "step": 6270 }, { "epoch": 0.9346142389037831, "grad_norm": 0.9651030898094177, "learning_rate": 4.672326481977957e-05, "loss": 0.8535, "num_input_tokens_seen": 3619104, "step": 6275 }, { "epoch": 0.9353589514447423, "grad_norm": 0.8841349482536316, "learning_rate": 4.6760500446827526e-05, "loss": 0.8244, "num_input_tokens_seen": 3621792, "step": 6280 }, { "epoch": 0.9361036639857016, "grad_norm": 0.8006298542022705, "learning_rate": 4.679773607387549e-05, "loss": 0.8951, "num_input_tokens_seen": 3624544, "step": 6285 }, { "epoch": 0.9368483765266608, "grad_norm": 1.0692481994628906, "learning_rate": 4.6834971700923445e-05, "loss": 0.8356, "num_input_tokens_seen": 3627424, "step": 6290 }, { "epoch": 0.9375930890676198, "grad_norm": 0.7015891075134277, "learning_rate": 4.687220732797141e-05, "loss": 0.82, "num_input_tokens_seen": 3630304, "step": 6295 }, { "epoch": 0.938337801608579, "grad_norm": 0.674673855304718, "learning_rate": 4.6909442955019365e-05, "loss": 0.7969, "num_input_tokens_seen": 3633216, "step": 6300 }, { "epoch": 0.9390825141495382, "grad_norm": 0.868369996547699, "learning_rate": 4.694667858206732e-05, "loss": 0.8572, "num_input_tokens_seen": 3636000, "step": 6305 }, { "epoch": 0.9398272266904975, "grad_norm": 0.927507221698761, "learning_rate": 4.6983914209115285e-05, "loss": 0.7888, "num_input_tokens_seen": 3638848, "step": 6310 }, { "epoch": 0.9405719392314567, "grad_norm": 0.7367343902587891, "learning_rate": 4.702114983616324e-05, "loss": 0.8347, "num_input_tokens_seen": 3642144, "step": 6315 }, { "epoch": 0.9413166517724159, "grad_norm": 0.9558299779891968, "learning_rate": 4.7058385463211205e-05, "loss": 0.8061, "num_input_tokens_seen": 3644832, "step": 6320 }, { "epoch": 0.9420613643133751, "grad_norm": 0.922283947467804, "learning_rate": 4.709562109025916e-05, "loss": 0.8383, "num_input_tokens_seen": 3647680, "step": 6325 }, { "epoch": 0.9428060768543343, "grad_norm": 1.0635024309158325, "learning_rate": 4.7132856717307124e-05, "loss": 0.8883, "num_input_tokens_seen": 3650528, "step": 6330 }, { "epoch": 0.9435507893952935, "grad_norm": 0.6619310975074768, "learning_rate": 4.717009234435508e-05, "loss": 0.8436, "num_input_tokens_seen": 3653344, "step": 6335 }, { "epoch": 0.9442955019362526, "grad_norm": 0.9062244296073914, "learning_rate": 4.7207327971403044e-05, "loss": 0.8239, "num_input_tokens_seen": 3656416, "step": 6340 }, { "epoch": 0.9450402144772118, "grad_norm": 0.5634430050849915, "learning_rate": 4.7244563598451e-05, "loss": 0.8196, "num_input_tokens_seen": 3659200, "step": 6345 }, { "epoch": 0.945784927018171, "grad_norm": 1.1168267726898193, "learning_rate": 4.728179922549896e-05, "loss": 0.8731, "num_input_tokens_seen": 3662144, "step": 6350 }, { "epoch": 0.9465296395591302, "grad_norm": 1.1220011711120605, "learning_rate": 4.731903485254692e-05, "loss": 0.8149, "num_input_tokens_seen": 3664768, "step": 6355 }, { "epoch": 0.9472743521000894, "grad_norm": 0.8273899555206299, "learning_rate": 4.7356270479594877e-05, "loss": 0.829, "num_input_tokens_seen": 3667296, "step": 6360 }, { "epoch": 0.9480190646410486, "grad_norm": 0.9411917328834534, "learning_rate": 4.739350610664284e-05, "loss": 0.8045, "num_input_tokens_seen": 3670112, "step": 6365 }, { "epoch": 0.9487637771820078, "grad_norm": 0.6349847316741943, "learning_rate": 4.7430741733690796e-05, "loss": 0.7963, "num_input_tokens_seen": 3672800, "step": 6370 }, { "epoch": 0.949508489722967, "grad_norm": 0.903095006942749, "learning_rate": 4.746797736073876e-05, "loss": 0.83, "num_input_tokens_seen": 3675968, "step": 6375 }, { "epoch": 0.9502532022639262, "grad_norm": 1.130286455154419, "learning_rate": 4.7505212987786716e-05, "loss": 0.8597, "num_input_tokens_seen": 3678880, "step": 6380 }, { "epoch": 0.9509979148048853, "grad_norm": 0.7774851322174072, "learning_rate": 4.754244861483468e-05, "loss": 0.8191, "num_input_tokens_seen": 3681952, "step": 6385 }, { "epoch": 0.9517426273458445, "grad_norm": 1.074331283569336, "learning_rate": 4.7579684241882636e-05, "loss": 0.8032, "num_input_tokens_seen": 3684736, "step": 6390 }, { "epoch": 0.9524873398868037, "grad_norm": 0.9567838311195374, "learning_rate": 4.761691986893059e-05, "loss": 0.8018, "num_input_tokens_seen": 3687744, "step": 6395 }, { "epoch": 0.9532320524277629, "grad_norm": 0.5713664889335632, "learning_rate": 4.7654155495978555e-05, "loss": 0.9013, "num_input_tokens_seen": 3690848, "step": 6400 }, { "epoch": 0.9539767649687221, "grad_norm": 0.8682148456573486, "learning_rate": 4.769139112302651e-05, "loss": 0.8204, "num_input_tokens_seen": 3693856, "step": 6405 }, { "epoch": 0.9547214775096813, "grad_norm": 0.7105472683906555, "learning_rate": 4.7728626750074475e-05, "loss": 0.8365, "num_input_tokens_seen": 3696608, "step": 6410 }, { "epoch": 0.9554661900506405, "grad_norm": 0.6346561908721924, "learning_rate": 4.776586237712243e-05, "loss": 0.8367, "num_input_tokens_seen": 3699552, "step": 6415 }, { "epoch": 0.9562109025915997, "grad_norm": 0.6513233184814453, "learning_rate": 4.7803098004170395e-05, "loss": 0.8381, "num_input_tokens_seen": 3702176, "step": 6420 }, { "epoch": 0.9569556151325588, "grad_norm": 0.9287636876106262, "learning_rate": 4.784033363121835e-05, "loss": 0.8387, "num_input_tokens_seen": 3704960, "step": 6425 }, { "epoch": 0.957700327673518, "grad_norm": 0.7921863794326782, "learning_rate": 4.7877569258266315e-05, "loss": 0.8784, "num_input_tokens_seen": 3707584, "step": 6430 }, { "epoch": 0.9584450402144772, "grad_norm": 0.8126815557479858, "learning_rate": 4.791480488531427e-05, "loss": 0.8566, "num_input_tokens_seen": 3710240, "step": 6435 }, { "epoch": 0.9591897527554364, "grad_norm": 0.8188415169715881, "learning_rate": 4.795204051236223e-05, "loss": 0.8282, "num_input_tokens_seen": 3712960, "step": 6440 }, { "epoch": 0.9599344652963956, "grad_norm": 0.7111068964004517, "learning_rate": 4.798927613941019e-05, "loss": 0.7703, "num_input_tokens_seen": 3715552, "step": 6445 }, { "epoch": 0.9606791778373548, "grad_norm": 0.7998884320259094, "learning_rate": 4.802651176645815e-05, "loss": 0.8216, "num_input_tokens_seen": 3718304, "step": 6450 }, { "epoch": 0.961423890378314, "grad_norm": 1.0262055397033691, "learning_rate": 4.806374739350611e-05, "loss": 0.8276, "num_input_tokens_seen": 3721280, "step": 6455 }, { "epoch": 0.9621686029192732, "grad_norm": 0.9599863886833191, "learning_rate": 4.810098302055407e-05, "loss": 0.8426, "num_input_tokens_seen": 3724480, "step": 6460 }, { "epoch": 0.9629133154602324, "grad_norm": 0.8106785416603088, "learning_rate": 4.813821864760203e-05, "loss": 0.8111, "num_input_tokens_seen": 3727232, "step": 6465 }, { "epoch": 0.9636580280011915, "grad_norm": 0.6803930401802063, "learning_rate": 4.8175454274649987e-05, "loss": 0.826, "num_input_tokens_seen": 3729952, "step": 6470 }, { "epoch": 0.9644027405421507, "grad_norm": 1.1491830348968506, "learning_rate": 4.821268990169795e-05, "loss": 0.7912, "num_input_tokens_seen": 3732800, "step": 6475 }, { "epoch": 0.9651474530831099, "grad_norm": 0.7085845470428467, "learning_rate": 4.8249925528745906e-05, "loss": 0.8206, "num_input_tokens_seen": 3735488, "step": 6480 }, { "epoch": 0.9658921656240691, "grad_norm": 0.7608104348182678, "learning_rate": 4.828716115579386e-05, "loss": 0.8347, "num_input_tokens_seen": 3738304, "step": 6485 }, { "epoch": 0.9666368781650283, "grad_norm": 0.8205829858779907, "learning_rate": 4.8324396782841826e-05, "loss": 0.8463, "num_input_tokens_seen": 3741024, "step": 6490 }, { "epoch": 0.9673815907059875, "grad_norm": 0.6643754243850708, "learning_rate": 4.836163240988978e-05, "loss": 0.9678, "num_input_tokens_seen": 3744992, "step": 6495 }, { "epoch": 0.9681263032469467, "grad_norm": 0.8599901795387268, "learning_rate": 4.8398868036937746e-05, "loss": 0.8388, "num_input_tokens_seen": 3747840, "step": 6500 }, { "epoch": 0.9688710157879059, "grad_norm": 0.8739225268363953, "learning_rate": 4.84361036639857e-05, "loss": 0.7633, "num_input_tokens_seen": 3750752, "step": 6505 }, { "epoch": 0.9696157283288651, "grad_norm": 0.8079817891120911, "learning_rate": 4.8473339291033665e-05, "loss": 0.8416, "num_input_tokens_seen": 3753856, "step": 6510 }, { "epoch": 0.9703604408698242, "grad_norm": 0.7044819593429565, "learning_rate": 4.851057491808162e-05, "loss": 0.812, "num_input_tokens_seen": 3756704, "step": 6515 }, { "epoch": 0.9711051534107834, "grad_norm": 0.8154618740081787, "learning_rate": 4.8547810545129585e-05, "loss": 0.842, "num_input_tokens_seen": 3759488, "step": 6520 }, { "epoch": 0.9718498659517426, "grad_norm": 0.7452366352081299, "learning_rate": 4.858504617217754e-05, "loss": 0.8423, "num_input_tokens_seen": 3762272, "step": 6525 }, { "epoch": 0.9725945784927018, "grad_norm": 0.9253728985786438, "learning_rate": 4.86222817992255e-05, "loss": 0.7908, "num_input_tokens_seen": 3764928, "step": 6530 }, { "epoch": 0.973339291033661, "grad_norm": 1.1699377298355103, "learning_rate": 4.865951742627346e-05, "loss": 0.813, "num_input_tokens_seen": 3768192, "step": 6535 }, { "epoch": 0.9740840035746202, "grad_norm": 0.5942056775093079, "learning_rate": 4.869675305332142e-05, "loss": 0.8031, "num_input_tokens_seen": 3771136, "step": 6540 }, { "epoch": 0.9748287161155794, "grad_norm": 0.6748594045639038, "learning_rate": 4.873398868036938e-05, "loss": 0.8246, "num_input_tokens_seen": 3774112, "step": 6545 }, { "epoch": 0.9755734286565386, "grad_norm": 0.7709231972694397, "learning_rate": 4.877122430741734e-05, "loss": 0.8415, "num_input_tokens_seen": 3777024, "step": 6550 }, { "epoch": 0.9763181411974977, "grad_norm": 0.48958081007003784, "learning_rate": 4.88084599344653e-05, "loss": 0.8554, "num_input_tokens_seen": 3779968, "step": 6555 }, { "epoch": 0.9770628537384569, "grad_norm": 0.8348582983016968, "learning_rate": 4.8845695561513264e-05, "loss": 0.8294, "num_input_tokens_seen": 3782752, "step": 6560 }, { "epoch": 0.9778075662794161, "grad_norm": 0.6834572553634644, "learning_rate": 4.8882931188561214e-05, "loss": 0.8362, "num_input_tokens_seen": 3785664, "step": 6565 }, { "epoch": 0.9785522788203753, "grad_norm": 0.6313058733940125, "learning_rate": 4.892016681560918e-05, "loss": 0.8196, "num_input_tokens_seen": 3788224, "step": 6570 }, { "epoch": 0.9792969913613345, "grad_norm": 0.7263492941856384, "learning_rate": 4.895740244265713e-05, "loss": 0.857, "num_input_tokens_seen": 3791136, "step": 6575 }, { "epoch": 0.9800417039022937, "grad_norm": 0.6391903758049011, "learning_rate": 4.8994638069705097e-05, "loss": 0.8632, "num_input_tokens_seen": 3794240, "step": 6580 }, { "epoch": 0.9807864164432529, "grad_norm": 0.5960806012153625, "learning_rate": 4.903187369675305e-05, "loss": 0.8563, "num_input_tokens_seen": 3797088, "step": 6585 }, { "epoch": 0.9815311289842121, "grad_norm": 0.8657112121582031, "learning_rate": 4.9069109323801016e-05, "loss": 0.7807, "num_input_tokens_seen": 3800096, "step": 6590 }, { "epoch": 0.9822758415251713, "grad_norm": 0.7738462090492249, "learning_rate": 4.910634495084897e-05, "loss": 0.8354, "num_input_tokens_seen": 3803072, "step": 6595 }, { "epoch": 0.9830205540661304, "grad_norm": 0.9232980608940125, "learning_rate": 4.9143580577896936e-05, "loss": 0.8095, "num_input_tokens_seen": 3806048, "step": 6600 }, { "epoch": 0.9837652666070896, "grad_norm": 1.0025033950805664, "learning_rate": 4.91808162049449e-05, "loss": 0.8651, "num_input_tokens_seen": 3808768, "step": 6605 }, { "epoch": 0.9845099791480488, "grad_norm": 0.6676769256591797, "learning_rate": 4.921805183199285e-05, "loss": 0.8948, "num_input_tokens_seen": 3811872, "step": 6610 }, { "epoch": 0.985254691689008, "grad_norm": 1.0152019262313843, "learning_rate": 4.925528745904081e-05, "loss": 0.8418, "num_input_tokens_seen": 3814944, "step": 6615 }, { "epoch": 0.9859994042299672, "grad_norm": 0.6587181091308594, "learning_rate": 4.929252308608877e-05, "loss": 0.7991, "num_input_tokens_seen": 3817728, "step": 6620 }, { "epoch": 0.9867441167709264, "grad_norm": 1.1083242893218994, "learning_rate": 4.932975871313673e-05, "loss": 0.8041, "num_input_tokens_seen": 3820512, "step": 6625 }, { "epoch": 0.9874888293118856, "grad_norm": 0.7053787112236023, "learning_rate": 4.936699434018469e-05, "loss": 0.8533, "num_input_tokens_seen": 3823712, "step": 6630 }, { "epoch": 0.9882335418528448, "grad_norm": 0.8268150687217712, "learning_rate": 4.940422996723265e-05, "loss": 0.8302, "num_input_tokens_seen": 3826336, "step": 6635 }, { "epoch": 0.988978254393804, "grad_norm": 0.9008312225341797, "learning_rate": 4.9441465594280615e-05, "loss": 0.7579, "num_input_tokens_seen": 3828992, "step": 6640 }, { "epoch": 0.9897229669347631, "grad_norm": 0.6423450708389282, "learning_rate": 4.947870122132857e-05, "loss": 0.7844, "num_input_tokens_seen": 3831712, "step": 6645 }, { "epoch": 0.9904676794757223, "grad_norm": 0.6434794664382935, "learning_rate": 4.9515936848376534e-05, "loss": 0.8161, "num_input_tokens_seen": 3834304, "step": 6650 }, { "epoch": 0.9912123920166815, "grad_norm": 0.7450922727584839, "learning_rate": 4.9553172475424484e-05, "loss": 0.8047, "num_input_tokens_seen": 3837056, "step": 6655 }, { "epoch": 0.9919571045576407, "grad_norm": 0.7261887788772583, "learning_rate": 4.959040810247245e-05, "loss": 0.8216, "num_input_tokens_seen": 3840000, "step": 6660 }, { "epoch": 0.9927018170986, "grad_norm": 0.9718079566955566, "learning_rate": 4.9627643729520404e-05, "loss": 0.7947, "num_input_tokens_seen": 3842848, "step": 6665 }, { "epoch": 0.9934465296395592, "grad_norm": 0.8796062469482422, "learning_rate": 4.966487935656837e-05, "loss": 0.8497, "num_input_tokens_seen": 3845600, "step": 6670 }, { "epoch": 0.9941912421805184, "grad_norm": 0.7989456057548523, "learning_rate": 4.9702114983616324e-05, "loss": 0.7934, "num_input_tokens_seen": 3848704, "step": 6675 }, { "epoch": 0.9949359547214776, "grad_norm": 0.9974679350852966, "learning_rate": 4.973935061066429e-05, "loss": 0.8775, "num_input_tokens_seen": 3851680, "step": 6680 }, { "epoch": 0.9956806672624366, "grad_norm": 0.7276226878166199, "learning_rate": 4.977658623771225e-05, "loss": 0.8433, "num_input_tokens_seen": 3854400, "step": 6685 }, { "epoch": 0.9964253798033958, "grad_norm": 0.7543637752532959, "learning_rate": 4.9813821864760207e-05, "loss": 0.8475, "num_input_tokens_seen": 3857632, "step": 6690 }, { "epoch": 0.997170092344355, "grad_norm": 0.6505950093269348, "learning_rate": 4.985105749180816e-05, "loss": 0.7931, "num_input_tokens_seen": 3860192, "step": 6695 }, { "epoch": 0.9979148048853143, "grad_norm": 0.6656701564788818, "learning_rate": 4.988829311885612e-05, "loss": 0.7894, "num_input_tokens_seen": 3862944, "step": 6700 }, { "epoch": 0.9986595174262735, "grad_norm": 0.9770694971084595, "learning_rate": 4.992552874590408e-05, "loss": 0.8261, "num_input_tokens_seen": 3865856, "step": 6705 }, { "epoch": 0.9994042299672327, "grad_norm": 0.6572821140289307, "learning_rate": 4.996276437295204e-05, "loss": 0.8336, "num_input_tokens_seen": 3868704, "step": 6710 }, { "epoch": 1.0, "eval_loss": 0.8211696147918701, "eval_runtime": 45.4421, "eval_samples_per_second": 65.666, "eval_steps_per_second": 16.417, "num_input_tokens_seen": 3870688, "step": 6714 }, { "epoch": 1.0001489425081918, "grad_norm": 0.8652033805847168, "learning_rate": 5e-05, "loss": 0.8327, "num_input_tokens_seen": 3871200, "step": 6715 }, { "epoch": 1.000893655049151, "grad_norm": 0.6300815939903259, "learning_rate": 4.999999915530083e-05, "loss": 0.8392, "num_input_tokens_seen": 3873984, "step": 6720 }, { "epoch": 1.0016383675901102, "grad_norm": 0.5630407333374023, "learning_rate": 4.999999662120335e-05, "loss": 0.8276, "num_input_tokens_seen": 3876960, "step": 6725 }, { "epoch": 1.0023830801310694, "grad_norm": 1.1360963582992554, "learning_rate": 4.999999239770774e-05, "loss": 0.8277, "num_input_tokens_seen": 3879968, "step": 6730 }, { "epoch": 1.0031277926720286, "grad_norm": 0.8354423642158508, "learning_rate": 4.999998648481429e-05, "loss": 0.7936, "num_input_tokens_seen": 3882816, "step": 6735 }, { "epoch": 1.0038725052129878, "grad_norm": 0.8296881914138794, "learning_rate": 4.999997888252339e-05, "loss": 0.867, "num_input_tokens_seen": 3885760, "step": 6740 }, { "epoch": 1.004617217753947, "grad_norm": 0.9843464493751526, "learning_rate": 4.999996959083556e-05, "loss": 0.8737, "num_input_tokens_seen": 3888576, "step": 6745 }, { "epoch": 1.0053619302949062, "grad_norm": 0.7320095896720886, "learning_rate": 4.999995860975143e-05, "loss": 0.8412, "num_input_tokens_seen": 3891360, "step": 6750 }, { "epoch": 1.0061066428358654, "grad_norm": 0.8303598165512085, "learning_rate": 4.9999945939271744e-05, "loss": 0.8297, "num_input_tokens_seen": 3894208, "step": 6755 }, { "epoch": 1.0068513553768246, "grad_norm": 0.655915379524231, "learning_rate": 4.999993157939735e-05, "loss": 0.7927, "num_input_tokens_seen": 3896928, "step": 6760 }, { "epoch": 1.0075960679177838, "grad_norm": 0.8200067281723022, "learning_rate": 4.999991553012923e-05, "loss": 0.8187, "num_input_tokens_seen": 3899968, "step": 6765 }, { "epoch": 1.008340780458743, "grad_norm": 0.9544994831085205, "learning_rate": 4.999989779146845e-05, "loss": 0.825, "num_input_tokens_seen": 3902688, "step": 6770 }, { "epoch": 1.0090854929997022, "grad_norm": 0.6536201238632202, "learning_rate": 4.999987836341622e-05, "loss": 0.8368, "num_input_tokens_seen": 3905504, "step": 6775 }, { "epoch": 1.0098302055406614, "grad_norm": 0.8888244032859802, "learning_rate": 4.999985724597386e-05, "loss": 0.8187, "num_input_tokens_seen": 3908480, "step": 6780 }, { "epoch": 1.0105749180816206, "grad_norm": 0.7003964185714722, "learning_rate": 4.9999834439142776e-05, "loss": 0.7735, "num_input_tokens_seen": 3911168, "step": 6785 }, { "epoch": 1.0113196306225798, "grad_norm": 1.0617443323135376, "learning_rate": 4.999980994292454e-05, "loss": 0.865, "num_input_tokens_seen": 3913888, "step": 6790 }, { "epoch": 1.0120643431635388, "grad_norm": 0.9314053654670715, "learning_rate": 4.9999783757320776e-05, "loss": 0.834, "num_input_tokens_seen": 3916768, "step": 6795 }, { "epoch": 1.012809055704498, "grad_norm": 0.7283264398574829, "learning_rate": 4.9999755882333275e-05, "loss": 0.8192, "num_input_tokens_seen": 3919552, "step": 6800 }, { "epoch": 1.0135537682454572, "grad_norm": 0.891809344291687, "learning_rate": 4.999972631796391e-05, "loss": 0.8312, "num_input_tokens_seen": 3922368, "step": 6805 }, { "epoch": 1.0142984807864164, "grad_norm": 0.7467648386955261, "learning_rate": 4.999969506421468e-05, "loss": 0.7979, "num_input_tokens_seen": 3925216, "step": 6810 }, { "epoch": 1.0150431933273756, "grad_norm": 0.7966206073760986, "learning_rate": 4.99996621210877e-05, "loss": 0.8274, "num_input_tokens_seen": 3928416, "step": 6815 }, { "epoch": 1.0157879058683348, "grad_norm": 0.854318380355835, "learning_rate": 4.99996274885852e-05, "loss": 0.7635, "num_input_tokens_seen": 3931584, "step": 6820 }, { "epoch": 1.016532618409294, "grad_norm": 0.6846252083778381, "learning_rate": 4.999959116670951e-05, "loss": 0.7926, "num_input_tokens_seen": 3934464, "step": 6825 }, { "epoch": 1.0172773309502532, "grad_norm": 0.8125613927841187, "learning_rate": 4.999955315546309e-05, "loss": 0.8639, "num_input_tokens_seen": 3937440, "step": 6830 }, { "epoch": 1.0180220434912124, "grad_norm": 0.835975706577301, "learning_rate": 4.999951345484851e-05, "loss": 0.8462, "num_input_tokens_seen": 3940224, "step": 6835 }, { "epoch": 1.0187667560321716, "grad_norm": 0.7059467434883118, "learning_rate": 4.999947206486846e-05, "loss": 0.7985, "num_input_tokens_seen": 3943040, "step": 6840 }, { "epoch": 1.0195114685731308, "grad_norm": 0.8861627578735352, "learning_rate": 4.999942898552571e-05, "loss": 0.8488, "num_input_tokens_seen": 3945792, "step": 6845 }, { "epoch": 1.02025618111409, "grad_norm": 0.7149626016616821, "learning_rate": 4.99993842168232e-05, "loss": 0.8307, "num_input_tokens_seen": 3948320, "step": 6850 }, { "epoch": 1.0210008936550492, "grad_norm": 0.5731320977210999, "learning_rate": 4.999933775876395e-05, "loss": 0.8359, "num_input_tokens_seen": 3951232, "step": 6855 }, { "epoch": 1.0217456061960084, "grad_norm": 0.7406325340270996, "learning_rate": 4.999928961135109e-05, "loss": 0.8267, "num_input_tokens_seen": 3954080, "step": 6860 }, { "epoch": 1.0224903187369676, "grad_norm": 0.7011671662330627, "learning_rate": 4.9999239774587867e-05, "loss": 0.7985, "num_input_tokens_seen": 3956768, "step": 6865 }, { "epoch": 1.0232350312779268, "grad_norm": 0.8579574823379517, "learning_rate": 4.999918824847767e-05, "loss": 0.8354, "num_input_tokens_seen": 3959712, "step": 6870 }, { "epoch": 1.023979743818886, "grad_norm": 0.687664806842804, "learning_rate": 4.999913503302397e-05, "loss": 0.812, "num_input_tokens_seen": 3962336, "step": 6875 }, { "epoch": 1.024724456359845, "grad_norm": 0.6635074615478516, "learning_rate": 4.9999080128230365e-05, "loss": 0.7932, "num_input_tokens_seen": 3965280, "step": 6880 }, { "epoch": 1.0254691689008042, "grad_norm": 0.7294871211051941, "learning_rate": 4.9999023534100565e-05, "loss": 0.7915, "num_input_tokens_seen": 3968512, "step": 6885 }, { "epoch": 1.0262138814417634, "grad_norm": 0.8353764414787292, "learning_rate": 4.999896525063839e-05, "loss": 0.8414, "num_input_tokens_seen": 3971488, "step": 6890 }, { "epoch": 1.0269585939827226, "grad_norm": 0.7815722227096558, "learning_rate": 4.999890527784777e-05, "loss": 0.8402, "num_input_tokens_seen": 3974560, "step": 6895 }, { "epoch": 1.0277033065236818, "grad_norm": 0.8280229568481445, "learning_rate": 4.999884361573279e-05, "loss": 0.7794, "num_input_tokens_seen": 3977600, "step": 6900 }, { "epoch": 1.028448019064641, "grad_norm": 0.9982237815856934, "learning_rate": 4.999878026429758e-05, "loss": 0.8608, "num_input_tokens_seen": 3980864, "step": 6905 }, { "epoch": 1.0291927316056002, "grad_norm": 0.8530614972114563, "learning_rate": 4.999871522354645e-05, "loss": 0.8109, "num_input_tokens_seen": 3983936, "step": 6910 }, { "epoch": 1.0299374441465594, "grad_norm": 0.8406744599342346, "learning_rate": 4.999864849348378e-05, "loss": 0.8102, "num_input_tokens_seen": 3986784, "step": 6915 }, { "epoch": 1.0306821566875186, "grad_norm": 0.9328622817993164, "learning_rate": 4.999858007411408e-05, "loss": 0.7957, "num_input_tokens_seen": 3989408, "step": 6920 }, { "epoch": 1.0314268692284778, "grad_norm": 0.6359943747520447, "learning_rate": 4.999850996544197e-05, "loss": 0.8169, "num_input_tokens_seen": 3992384, "step": 6925 }, { "epoch": 1.032171581769437, "grad_norm": 0.7491590976715088, "learning_rate": 4.999843816747219e-05, "loss": 0.8559, "num_input_tokens_seen": 3994816, "step": 6930 }, { "epoch": 1.0329162943103962, "grad_norm": 0.6043179035186768, "learning_rate": 4.9998364680209605e-05, "loss": 0.8546, "num_input_tokens_seen": 3997472, "step": 6935 }, { "epoch": 1.0336610068513554, "grad_norm": 0.7010362148284912, "learning_rate": 4.999828950365917e-05, "loss": 0.7976, "num_input_tokens_seen": 4000352, "step": 6940 }, { "epoch": 1.0344057193923146, "grad_norm": 0.5954385995864868, "learning_rate": 4.999821263782597e-05, "loss": 0.829, "num_input_tokens_seen": 4003168, "step": 6945 }, { "epoch": 1.0351504319332738, "grad_norm": 0.6777523159980774, "learning_rate": 4.9998134082715184e-05, "loss": 0.8324, "num_input_tokens_seen": 4006016, "step": 6950 }, { "epoch": 1.035895144474233, "grad_norm": 0.9024150371551514, "learning_rate": 4.999805383833214e-05, "loss": 0.8236, "num_input_tokens_seen": 4008992, "step": 6955 }, { "epoch": 1.0366398570151922, "grad_norm": 0.6043156385421753, "learning_rate": 4.999797190468225e-05, "loss": 0.8726, "num_input_tokens_seen": 4011552, "step": 6960 }, { "epoch": 1.0373845695561514, "grad_norm": 0.683912992477417, "learning_rate": 4.999788828177105e-05, "loss": 0.7627, "num_input_tokens_seen": 4014304, "step": 6965 }, { "epoch": 1.0381292820971104, "grad_norm": 0.7627355456352234, "learning_rate": 4.9997802969604195e-05, "loss": 0.8304, "num_input_tokens_seen": 4017376, "step": 6970 }, { "epoch": 1.0388739946380696, "grad_norm": 0.503376841545105, "learning_rate": 4.9997715968187456e-05, "loss": 0.8506, "num_input_tokens_seen": 4020032, "step": 6975 }, { "epoch": 1.0396187071790288, "grad_norm": 0.6415961980819702, "learning_rate": 4.999762727752669e-05, "loss": 0.8139, "num_input_tokens_seen": 4022912, "step": 6980 }, { "epoch": 1.040363419719988, "grad_norm": 0.7324686050415039, "learning_rate": 4.9997536897627915e-05, "loss": 0.8051, "num_input_tokens_seen": 4025632, "step": 6985 }, { "epoch": 1.0411081322609472, "grad_norm": 0.7227103114128113, "learning_rate": 4.999744482849723e-05, "loss": 0.8003, "num_input_tokens_seen": 4028448, "step": 6990 }, { "epoch": 1.0418528448019064, "grad_norm": 0.7029945850372314, "learning_rate": 4.9997351070140856e-05, "loss": 0.8171, "num_input_tokens_seen": 4031424, "step": 6995 }, { "epoch": 1.0425975573428656, "grad_norm": 0.9337041974067688, "learning_rate": 4.999725562256513e-05, "loss": 0.8758, "num_input_tokens_seen": 4034400, "step": 7000 }, { "epoch": 1.0433422698838248, "grad_norm": 0.6819624900817871, "learning_rate": 4.9997158485776493e-05, "loss": 0.8246, "num_input_tokens_seen": 4037440, "step": 7005 }, { "epoch": 1.044086982424784, "grad_norm": 0.9079103469848633, "learning_rate": 4.9997059659781526e-05, "loss": 0.8324, "num_input_tokens_seen": 4040096, "step": 7010 }, { "epoch": 1.0448316949657432, "grad_norm": 0.7661221027374268, "learning_rate": 4.9996959144586895e-05, "loss": 0.8161, "num_input_tokens_seen": 4042944, "step": 7015 }, { "epoch": 1.0455764075067024, "grad_norm": 0.7216536998748779, "learning_rate": 4.999685694019939e-05, "loss": 0.8458, "num_input_tokens_seen": 4045568, "step": 7020 }, { "epoch": 1.0463211200476616, "grad_norm": 0.6483311653137207, "learning_rate": 4.9996753046625925e-05, "loss": 0.7906, "num_input_tokens_seen": 4048288, "step": 7025 }, { "epoch": 1.0470658325886208, "grad_norm": 0.6324891448020935, "learning_rate": 4.9996647463873525e-05, "loss": 0.7487, "num_input_tokens_seen": 4051168, "step": 7030 }, { "epoch": 1.04781054512958, "grad_norm": 1.00623619556427, "learning_rate": 4.999654019194931e-05, "loss": 0.772, "num_input_tokens_seen": 4053728, "step": 7035 }, { "epoch": 1.0485552576705393, "grad_norm": 0.9249163269996643, "learning_rate": 4.9996431230860544e-05, "loss": 0.8234, "num_input_tokens_seen": 4056768, "step": 7040 }, { "epoch": 1.0492999702114985, "grad_norm": 0.8245049118995667, "learning_rate": 4.999632058061457e-05, "loss": 0.8087, "num_input_tokens_seen": 4059648, "step": 7045 }, { "epoch": 1.0500446827524577, "grad_norm": 0.5520861744880676, "learning_rate": 4.999620824121889e-05, "loss": 0.8699, "num_input_tokens_seen": 4062208, "step": 7050 }, { "epoch": 1.0507893952934166, "grad_norm": 0.8771433234214783, "learning_rate": 4.999609421268109e-05, "loss": 0.8116, "num_input_tokens_seen": 4064960, "step": 7055 }, { "epoch": 1.0515341078343758, "grad_norm": 0.4840909242630005, "learning_rate": 4.999597849500886e-05, "loss": 0.8222, "num_input_tokens_seen": 4067680, "step": 7060 }, { "epoch": 1.052278820375335, "grad_norm": 0.8800010085105896, "learning_rate": 4.999586108821003e-05, "loss": 0.8099, "num_input_tokens_seen": 4070464, "step": 7065 }, { "epoch": 1.0530235329162942, "grad_norm": 0.5613735914230347, "learning_rate": 4.999574199229254e-05, "loss": 0.7895, "num_input_tokens_seen": 4073408, "step": 7070 }, { "epoch": 1.0537682454572534, "grad_norm": 0.6222699880599976, "learning_rate": 4.9995621207264426e-05, "loss": 0.7495, "num_input_tokens_seen": 4076160, "step": 7075 }, { "epoch": 1.0545129579982127, "grad_norm": 0.8933428525924683, "learning_rate": 4.9995498733133864e-05, "loss": 0.8354, "num_input_tokens_seen": 4079168, "step": 7080 }, { "epoch": 1.0552576705391719, "grad_norm": 0.8871914744377136, "learning_rate": 4.9995374569909105e-05, "loss": 0.8251, "num_input_tokens_seen": 4082208, "step": 7085 }, { "epoch": 1.056002383080131, "grad_norm": 0.6519229412078857, "learning_rate": 4.999524871759857e-05, "loss": 0.8108, "num_input_tokens_seen": 4085024, "step": 7090 }, { "epoch": 1.0567470956210903, "grad_norm": 0.8059476613998413, "learning_rate": 4.999512117621075e-05, "loss": 0.8818, "num_input_tokens_seen": 4087968, "step": 7095 }, { "epoch": 1.0574918081620495, "grad_norm": 0.7428897023200989, "learning_rate": 4.999499194575426e-05, "loss": 0.7819, "num_input_tokens_seen": 4091008, "step": 7100 }, { "epoch": 1.0582365207030087, "grad_norm": 0.7783142924308777, "learning_rate": 4.9994861026237826e-05, "loss": 0.8463, "num_input_tokens_seen": 4094080, "step": 7105 }, { "epoch": 1.0589812332439679, "grad_norm": 0.7016613483428955, "learning_rate": 4.999472841767032e-05, "loss": 0.8277, "num_input_tokens_seen": 4096672, "step": 7110 }, { "epoch": 1.059725945784927, "grad_norm": 1.0484604835510254, "learning_rate": 4.999459412006069e-05, "loss": 0.7959, "num_input_tokens_seen": 4099424, "step": 7115 }, { "epoch": 1.0604706583258863, "grad_norm": 0.8231940269470215, "learning_rate": 4.9994458133418e-05, "loss": 0.8528, "num_input_tokens_seen": 4102752, "step": 7120 }, { "epoch": 1.0612153708668455, "grad_norm": 0.769527792930603, "learning_rate": 4.9994320457751456e-05, "loss": 0.8377, "num_input_tokens_seen": 4105792, "step": 7125 }, { "epoch": 1.0619600834078047, "grad_norm": 0.942209005355835, "learning_rate": 4.9994181093070345e-05, "loss": 0.8635, "num_input_tokens_seen": 4109152, "step": 7130 }, { "epoch": 1.0627047959487639, "grad_norm": 0.6738665699958801, "learning_rate": 4.9994040039384104e-05, "loss": 0.8578, "num_input_tokens_seen": 4112192, "step": 7135 }, { "epoch": 1.063449508489723, "grad_norm": 0.7831280827522278, "learning_rate": 4.999389729670226e-05, "loss": 0.8079, "num_input_tokens_seen": 4115072, "step": 7140 }, { "epoch": 1.064194221030682, "grad_norm": 0.739861249923706, "learning_rate": 4.999375286503445e-05, "loss": 0.8064, "num_input_tokens_seen": 4117760, "step": 7145 }, { "epoch": 1.0649389335716413, "grad_norm": 0.8364281058311462, "learning_rate": 4.999360674439043e-05, "loss": 0.8068, "num_input_tokens_seen": 4120704, "step": 7150 }, { "epoch": 1.0656836461126005, "grad_norm": 0.5074242949485779, "learning_rate": 4.999345893478009e-05, "loss": 0.7877, "num_input_tokens_seen": 4123712, "step": 7155 }, { "epoch": 1.0664283586535597, "grad_norm": 0.5728082060813904, "learning_rate": 4.9993309436213415e-05, "loss": 0.8426, "num_input_tokens_seen": 4126528, "step": 7160 }, { "epoch": 1.0671730711945189, "grad_norm": 0.818901002407074, "learning_rate": 4.99931582487005e-05, "loss": 0.8053, "num_input_tokens_seen": 4129312, "step": 7165 }, { "epoch": 1.067917783735478, "grad_norm": 0.7170857787132263, "learning_rate": 4.999300537225157e-05, "loss": 0.8125, "num_input_tokens_seen": 4132192, "step": 7170 }, { "epoch": 1.0686624962764373, "grad_norm": 0.930282711982727, "learning_rate": 4.999285080687694e-05, "loss": 0.8502, "num_input_tokens_seen": 4134848, "step": 7175 }, { "epoch": 1.0694072088173965, "grad_norm": 0.7622508406639099, "learning_rate": 4.999269455258707e-05, "loss": 0.8666, "num_input_tokens_seen": 4137952, "step": 7180 }, { "epoch": 1.0701519213583557, "grad_norm": 1.0597282648086548, "learning_rate": 4.999253660939251e-05, "loss": 0.8365, "num_input_tokens_seen": 4140896, "step": 7185 }, { "epoch": 1.0708966338993149, "grad_norm": 0.8875032663345337, "learning_rate": 4.999237697730396e-05, "loss": 0.8024, "num_input_tokens_seen": 4143584, "step": 7190 }, { "epoch": 1.071641346440274, "grad_norm": 0.8417638540267944, "learning_rate": 4.9992215656332166e-05, "loss": 0.8196, "num_input_tokens_seen": 4146496, "step": 7195 }, { "epoch": 1.0723860589812333, "grad_norm": 0.8037329316139221, "learning_rate": 4.999205264648805e-05, "loss": 0.7866, "num_input_tokens_seen": 4149728, "step": 7200 }, { "epoch": 1.0731307715221925, "grad_norm": 0.5472152829170227, "learning_rate": 4.999188794778263e-05, "loss": 0.8618, "num_input_tokens_seen": 4152480, "step": 7205 }, { "epoch": 1.0738754840631517, "grad_norm": 0.5655068755149841, "learning_rate": 4.999172156022703e-05, "loss": 0.7401, "num_input_tokens_seen": 4155680, "step": 7210 }, { "epoch": 1.074620196604111, "grad_norm": 0.6881135702133179, "learning_rate": 4.9991553483832506e-05, "loss": 0.8151, "num_input_tokens_seen": 4158272, "step": 7215 }, { "epoch": 1.07536490914507, "grad_norm": 0.5183091163635254, "learning_rate": 4.9991383718610397e-05, "loss": 0.795, "num_input_tokens_seen": 4160960, "step": 7220 }, { "epoch": 1.076109621686029, "grad_norm": 0.8193150758743286, "learning_rate": 4.999121226457219e-05, "loss": 0.7767, "num_input_tokens_seen": 4164032, "step": 7225 }, { "epoch": 1.0768543342269883, "grad_norm": 0.8928486108779907, "learning_rate": 4.999103912172945e-05, "loss": 0.8272, "num_input_tokens_seen": 4167040, "step": 7230 }, { "epoch": 1.0775990467679475, "grad_norm": 0.6772527098655701, "learning_rate": 4.999086429009391e-05, "loss": 0.8047, "num_input_tokens_seen": 4169888, "step": 7235 }, { "epoch": 1.0783437593089067, "grad_norm": 0.6131108999252319, "learning_rate": 4.999068776967736e-05, "loss": 0.8224, "num_input_tokens_seen": 4172768, "step": 7240 }, { "epoch": 1.079088471849866, "grad_norm": 0.4784959852695465, "learning_rate": 4.999050956049173e-05, "loss": 0.8025, "num_input_tokens_seen": 4175552, "step": 7245 }, { "epoch": 1.079833184390825, "grad_norm": 0.9011492133140564, "learning_rate": 4.999032966254907e-05, "loss": 0.8763, "num_input_tokens_seen": 4178528, "step": 7250 }, { "epoch": 1.0805778969317843, "grad_norm": 0.7262475490570068, "learning_rate": 4.999014807586154e-05, "loss": 0.8485, "num_input_tokens_seen": 4181376, "step": 7255 }, { "epoch": 1.0813226094727435, "grad_norm": 0.8417482376098633, "learning_rate": 4.99899648004414e-05, "loss": 0.7874, "num_input_tokens_seen": 4184160, "step": 7260 }, { "epoch": 1.0820673220137027, "grad_norm": 0.5955368280410767, "learning_rate": 4.998977983630104e-05, "loss": 0.8253, "num_input_tokens_seen": 4186784, "step": 7265 }, { "epoch": 1.082812034554662, "grad_norm": 0.7838757634162903, "learning_rate": 4.9989593183452965e-05, "loss": 0.7754, "num_input_tokens_seen": 4189472, "step": 7270 }, { "epoch": 1.083556747095621, "grad_norm": 0.6535812020301819, "learning_rate": 4.9989404841909784e-05, "loss": 0.8554, "num_input_tokens_seen": 4192768, "step": 7275 }, { "epoch": 1.0843014596365803, "grad_norm": 0.6535593867301941, "learning_rate": 4.998921481168421e-05, "loss": 0.7632, "num_input_tokens_seen": 4195488, "step": 7280 }, { "epoch": 1.0850461721775395, "grad_norm": 0.720156192779541, "learning_rate": 4.9989023092789113e-05, "loss": 0.8388, "num_input_tokens_seen": 4198528, "step": 7285 }, { "epoch": 1.0857908847184987, "grad_norm": 0.5953981280326843, "learning_rate": 4.998882968523743e-05, "loss": 0.84, "num_input_tokens_seen": 4201312, "step": 7290 }, { "epoch": 1.086535597259458, "grad_norm": 0.9715330004692078, "learning_rate": 4.9988634589042227e-05, "loss": 0.7965, "num_input_tokens_seen": 4204032, "step": 7295 }, { "epoch": 1.0872803098004171, "grad_norm": 1.0123937129974365, "learning_rate": 4.9988437804216704e-05, "loss": 0.8862, "num_input_tokens_seen": 4207200, "step": 7300 }, { "epoch": 1.0880250223413763, "grad_norm": 0.7810103297233582, "learning_rate": 4.998823933077414e-05, "loss": 0.8277, "num_input_tokens_seen": 4210592, "step": 7305 }, { "epoch": 1.0887697348823355, "grad_norm": 0.7943045496940613, "learning_rate": 4.998803916872797e-05, "loss": 0.8341, "num_input_tokens_seen": 4213888, "step": 7310 }, { "epoch": 1.0895144474232945, "grad_norm": 0.5883284211158752, "learning_rate": 4.99878373180917e-05, "loss": 0.806, "num_input_tokens_seen": 4216544, "step": 7315 }, { "epoch": 1.0902591599642537, "grad_norm": 0.6123392581939697, "learning_rate": 4.9987633778878975e-05, "loss": 0.8013, "num_input_tokens_seen": 4219328, "step": 7320 }, { "epoch": 1.091003872505213, "grad_norm": 0.6244713664054871, "learning_rate": 4.9987428551103554e-05, "loss": 0.7989, "num_input_tokens_seen": 4222144, "step": 7325 }, { "epoch": 1.0917485850461721, "grad_norm": 0.6593179702758789, "learning_rate": 4.9987221634779303e-05, "loss": 0.806, "num_input_tokens_seen": 4225056, "step": 7330 }, { "epoch": 1.0924932975871313, "grad_norm": 0.7183020710945129, "learning_rate": 4.99870130299202e-05, "loss": 0.7689, "num_input_tokens_seen": 4228160, "step": 7335 }, { "epoch": 1.0932380101280905, "grad_norm": 0.6375951170921326, "learning_rate": 4.998680273654035e-05, "loss": 0.8585, "num_input_tokens_seen": 4230976, "step": 7340 }, { "epoch": 1.0939827226690497, "grad_norm": 0.7225058078765869, "learning_rate": 4.998659075465396e-05, "loss": 0.8399, "num_input_tokens_seen": 4233792, "step": 7345 }, { "epoch": 1.094727435210009, "grad_norm": 0.7162731289863586, "learning_rate": 4.998637708427536e-05, "loss": 0.8152, "num_input_tokens_seen": 4236992, "step": 7350 }, { "epoch": 1.0954721477509681, "grad_norm": 0.5537922978401184, "learning_rate": 4.998616172541898e-05, "loss": 0.8218, "num_input_tokens_seen": 4239712, "step": 7355 }, { "epoch": 1.0962168602919273, "grad_norm": 0.6904425621032715, "learning_rate": 4.9985944678099374e-05, "loss": 0.8383, "num_input_tokens_seen": 4242464, "step": 7360 }, { "epoch": 1.0969615728328865, "grad_norm": 0.8472482562065125, "learning_rate": 4.998572594233121e-05, "loss": 0.8037, "num_input_tokens_seen": 4245408, "step": 7365 }, { "epoch": 1.0977062853738457, "grad_norm": 0.8939396739006042, "learning_rate": 4.998550551812927e-05, "loss": 0.8207, "num_input_tokens_seen": 4248416, "step": 7370 }, { "epoch": 1.098450997914805, "grad_norm": 0.6707898378372192, "learning_rate": 4.998528340550846e-05, "loss": 0.8286, "num_input_tokens_seen": 4251584, "step": 7375 }, { "epoch": 1.0991957104557641, "grad_norm": 0.7448156476020813, "learning_rate": 4.998505960448377e-05, "loss": 0.8004, "num_input_tokens_seen": 4254528, "step": 7380 }, { "epoch": 1.0999404229967233, "grad_norm": 0.8242465257644653, "learning_rate": 4.998483411507034e-05, "loss": 0.8067, "num_input_tokens_seen": 4257376, "step": 7385 }, { "epoch": 1.1006851355376825, "grad_norm": 1.0511747598648071, "learning_rate": 4.9984606937283405e-05, "loss": 0.7892, "num_input_tokens_seen": 4260288, "step": 7390 }, { "epoch": 1.1014298480786415, "grad_norm": 0.8103150725364685, "learning_rate": 4.9984378071138315e-05, "loss": 0.8073, "num_input_tokens_seen": 4263200, "step": 7395 }, { "epoch": 1.1021745606196007, "grad_norm": 0.6382284760475159, "learning_rate": 4.998414751665053e-05, "loss": 0.7992, "num_input_tokens_seen": 4266048, "step": 7400 }, { "epoch": 1.10291927316056, "grad_norm": 0.7502655386924744, "learning_rate": 4.998391527383563e-05, "loss": 0.8188, "num_input_tokens_seen": 4269088, "step": 7405 }, { "epoch": 1.1036639857015191, "grad_norm": 0.6582499146461487, "learning_rate": 4.9983681342709316e-05, "loss": 0.7858, "num_input_tokens_seen": 4272192, "step": 7410 }, { "epoch": 1.1044086982424783, "grad_norm": 0.7494354248046875, "learning_rate": 4.998344572328739e-05, "loss": 0.8525, "num_input_tokens_seen": 4275200, "step": 7415 }, { "epoch": 1.1051534107834375, "grad_norm": 0.7875756621360779, "learning_rate": 4.998320841558578e-05, "loss": 0.7819, "num_input_tokens_seen": 4278368, "step": 7420 }, { "epoch": 1.1058981233243967, "grad_norm": 0.6705533862113953, "learning_rate": 4.9982969419620516e-05, "loss": 0.8536, "num_input_tokens_seen": 4281152, "step": 7425 }, { "epoch": 1.106642835865356, "grad_norm": 0.6778672337532043, "learning_rate": 4.9982728735407756e-05, "loss": 0.8554, "num_input_tokens_seen": 4283840, "step": 7430 }, { "epoch": 1.1073875484063151, "grad_norm": 0.6616614460945129, "learning_rate": 4.998248636296377e-05, "loss": 0.8868, "num_input_tokens_seen": 4286432, "step": 7435 }, { "epoch": 1.1081322609472744, "grad_norm": 0.6577677726745605, "learning_rate": 4.998224230230491e-05, "loss": 0.7715, "num_input_tokens_seen": 4289088, "step": 7440 }, { "epoch": 1.1088769734882336, "grad_norm": 0.8832501769065857, "learning_rate": 4.9981996553447695e-05, "loss": 0.8299, "num_input_tokens_seen": 4291712, "step": 7445 }, { "epoch": 1.1096216860291928, "grad_norm": 0.6699954271316528, "learning_rate": 4.998174911640872e-05, "loss": 0.8369, "num_input_tokens_seen": 4294432, "step": 7450 }, { "epoch": 1.110366398570152, "grad_norm": 0.633173406124115, "learning_rate": 4.9981499991204704e-05, "loss": 0.8326, "num_input_tokens_seen": 4297184, "step": 7455 }, { "epoch": 1.1111111111111112, "grad_norm": 0.9308719038963318, "learning_rate": 4.998124917785249e-05, "loss": 0.7792, "num_input_tokens_seen": 4299904, "step": 7460 }, { "epoch": 1.1118558236520704, "grad_norm": 0.6847658157348633, "learning_rate": 4.9980996676369026e-05, "loss": 0.7975, "num_input_tokens_seen": 4303008, "step": 7465 }, { "epoch": 1.1126005361930296, "grad_norm": 0.6243714094161987, "learning_rate": 4.998074248677137e-05, "loss": 0.7595, "num_input_tokens_seen": 4305888, "step": 7470 }, { "epoch": 1.1133452487339888, "grad_norm": 0.8164258599281311, "learning_rate": 4.9980486609076695e-05, "loss": 0.8571, "num_input_tokens_seen": 4309024, "step": 7475 }, { "epoch": 1.114089961274948, "grad_norm": 0.7320423126220703, "learning_rate": 4.998022904330231e-05, "loss": 0.8193, "num_input_tokens_seen": 4311936, "step": 7480 }, { "epoch": 1.1148346738159072, "grad_norm": 0.7413620352745056, "learning_rate": 4.9979969789465594e-05, "loss": 0.8435, "num_input_tokens_seen": 4314784, "step": 7485 }, { "epoch": 1.1155793863568662, "grad_norm": 0.7819477319717407, "learning_rate": 4.9979708847584095e-05, "loss": 0.8579, "num_input_tokens_seen": 4317792, "step": 7490 }, { "epoch": 1.1163240988978254, "grad_norm": 0.6846716403961182, "learning_rate": 4.9979446217675416e-05, "loss": 0.8077, "num_input_tokens_seen": 4320768, "step": 7495 }, { "epoch": 1.1170688114387846, "grad_norm": 0.7605370283126831, "learning_rate": 4.997918189975733e-05, "loss": 0.8435, "num_input_tokens_seen": 4323456, "step": 7500 }, { "epoch": 1.1178135239797438, "grad_norm": 0.46673959493637085, "learning_rate": 4.99789158938477e-05, "loss": 0.8112, "num_input_tokens_seen": 4326432, "step": 7505 }, { "epoch": 1.118558236520703, "grad_norm": 0.9719295501708984, "learning_rate": 4.9978648199964476e-05, "loss": 0.8801, "num_input_tokens_seen": 4329504, "step": 7510 }, { "epoch": 1.1193029490616622, "grad_norm": 0.6073809266090393, "learning_rate": 4.997837881812577e-05, "loss": 0.8175, "num_input_tokens_seen": 4332576, "step": 7515 }, { "epoch": 1.1200476616026214, "grad_norm": 0.7401876449584961, "learning_rate": 4.997810774834977e-05, "loss": 0.7984, "num_input_tokens_seen": 4335520, "step": 7520 }, { "epoch": 1.1207923741435806, "grad_norm": 0.8547123074531555, "learning_rate": 4.9977834990654804e-05, "loss": 0.8076, "num_input_tokens_seen": 4338400, "step": 7525 }, { "epoch": 1.1215370866845398, "grad_norm": 0.8408424258232117, "learning_rate": 4.997756054505931e-05, "loss": 0.8358, "num_input_tokens_seen": 4341280, "step": 7530 }, { "epoch": 1.122281799225499, "grad_norm": 0.7861656546592712, "learning_rate": 4.9977284411581816e-05, "loss": 0.8457, "num_input_tokens_seen": 4344320, "step": 7535 }, { "epoch": 1.1230265117664582, "grad_norm": 0.7237065434455872, "learning_rate": 4.997700659024099e-05, "loss": 0.807, "num_input_tokens_seen": 4347040, "step": 7540 }, { "epoch": 1.1237712243074174, "grad_norm": 0.5581052899360657, "learning_rate": 4.997672708105562e-05, "loss": 0.8284, "num_input_tokens_seen": 4350016, "step": 7545 }, { "epoch": 1.1245159368483766, "grad_norm": 0.7319459915161133, "learning_rate": 4.9976445884044575e-05, "loss": 0.8075, "num_input_tokens_seen": 4352768, "step": 7550 }, { "epoch": 1.1252606493893358, "grad_norm": 0.551694929599762, "learning_rate": 4.9976162999226865e-05, "loss": 0.7854, "num_input_tokens_seen": 4355520, "step": 7555 }, { "epoch": 1.126005361930295, "grad_norm": 0.6417450904846191, "learning_rate": 4.9975878426621605e-05, "loss": 0.8452, "num_input_tokens_seen": 4358272, "step": 7560 }, { "epoch": 1.1267500744712542, "grad_norm": 0.8932695984840393, "learning_rate": 4.9975592166248025e-05, "loss": 0.8832, "num_input_tokens_seen": 4361280, "step": 7565 }, { "epoch": 1.1274947870122132, "grad_norm": 0.6451692581176758, "learning_rate": 4.997530421812547e-05, "loss": 0.865, "num_input_tokens_seen": 4364416, "step": 7570 }, { "epoch": 1.1282394995531724, "grad_norm": 0.6895520091056824, "learning_rate": 4.997501458227339e-05, "loss": 0.8355, "num_input_tokens_seen": 4367360, "step": 7575 }, { "epoch": 1.1289842120941316, "grad_norm": 0.8117805123329163, "learning_rate": 4.997472325871138e-05, "loss": 0.8266, "num_input_tokens_seen": 4370304, "step": 7580 }, { "epoch": 1.1297289246350908, "grad_norm": 0.7006869912147522, "learning_rate": 4.9974430247459106e-05, "loss": 0.8208, "num_input_tokens_seen": 4373152, "step": 7585 }, { "epoch": 1.13047363717605, "grad_norm": 0.6509155035018921, "learning_rate": 4.997413554853637e-05, "loss": 0.7831, "num_input_tokens_seen": 4376096, "step": 7590 }, { "epoch": 1.1312183497170092, "grad_norm": 0.6695752143859863, "learning_rate": 4.99738391619631e-05, "loss": 0.8279, "num_input_tokens_seen": 4379040, "step": 7595 }, { "epoch": 1.1319630622579684, "grad_norm": 0.6637008190155029, "learning_rate": 4.997354108775931e-05, "loss": 0.7952, "num_input_tokens_seen": 4381664, "step": 7600 }, { "epoch": 1.1327077747989276, "grad_norm": 0.5242177248001099, "learning_rate": 4.997324132594515e-05, "loss": 0.831, "num_input_tokens_seen": 4384288, "step": 7605 }, { "epoch": 1.1334524873398868, "grad_norm": 0.683260977268219, "learning_rate": 4.997293987654087e-05, "loss": 0.8054, "num_input_tokens_seen": 4387264, "step": 7610 }, { "epoch": 1.134197199880846, "grad_norm": 0.5309320688247681, "learning_rate": 4.997263673956685e-05, "loss": 0.824, "num_input_tokens_seen": 4389952, "step": 7615 }, { "epoch": 1.1349419124218052, "grad_norm": 0.8656079173088074, "learning_rate": 4.9972331915043575e-05, "loss": 0.8306, "num_input_tokens_seen": 4393088, "step": 7620 }, { "epoch": 1.1356866249627644, "grad_norm": 1.0363644361495972, "learning_rate": 4.997202540299163e-05, "loss": 0.7993, "num_input_tokens_seen": 4395968, "step": 7625 }, { "epoch": 1.1364313375037236, "grad_norm": 0.8127846717834473, "learning_rate": 4.997171720343175e-05, "loss": 0.8437, "num_input_tokens_seen": 4398912, "step": 7630 }, { "epoch": 1.1371760500446828, "grad_norm": 0.54375159740448, "learning_rate": 4.9971407316384736e-05, "loss": 0.8458, "num_input_tokens_seen": 4401824, "step": 7635 }, { "epoch": 1.137920762585642, "grad_norm": 0.7076386213302612, "learning_rate": 4.997109574187154e-05, "loss": 0.8662, "num_input_tokens_seen": 4404704, "step": 7640 }, { "epoch": 1.1386654751266012, "grad_norm": 0.7381649613380432, "learning_rate": 4.997078247991323e-05, "loss": 0.7877, "num_input_tokens_seen": 4407936, "step": 7645 }, { "epoch": 1.1394101876675604, "grad_norm": 0.7592368125915527, "learning_rate": 4.9970467530530964e-05, "loss": 0.8253, "num_input_tokens_seen": 4410784, "step": 7650 }, { "epoch": 1.1401549002085196, "grad_norm": 0.6145147085189819, "learning_rate": 4.9970150893746016e-05, "loss": 0.8053, "num_input_tokens_seen": 4413440, "step": 7655 }, { "epoch": 1.1408996127494788, "grad_norm": 0.7004824876785278, "learning_rate": 4.99698325695798e-05, "loss": 0.8221, "num_input_tokens_seen": 4416000, "step": 7660 }, { "epoch": 1.1416443252904378, "grad_norm": 0.7066536545753479, "learning_rate": 4.996951255805381e-05, "loss": 0.8043, "num_input_tokens_seen": 4418816, "step": 7665 }, { "epoch": 1.142389037831397, "grad_norm": 0.8777879476547241, "learning_rate": 4.996919085918969e-05, "loss": 0.8446, "num_input_tokens_seen": 4421792, "step": 7670 }, { "epoch": 1.1431337503723562, "grad_norm": 0.7159916758537292, "learning_rate": 4.996886747300916e-05, "loss": 0.8232, "num_input_tokens_seen": 4424640, "step": 7675 }, { "epoch": 1.1438784629133154, "grad_norm": 0.8622274994850159, "learning_rate": 4.996854239953409e-05, "loss": 0.8431, "num_input_tokens_seen": 4427392, "step": 7680 }, { "epoch": 1.1446231754542746, "grad_norm": 0.6948341131210327, "learning_rate": 4.996821563878643e-05, "loss": 0.8067, "num_input_tokens_seen": 4430400, "step": 7685 }, { "epoch": 1.1453678879952338, "grad_norm": 0.6008358597755432, "learning_rate": 4.9967887190788274e-05, "loss": 0.814, "num_input_tokens_seen": 4432992, "step": 7690 }, { "epoch": 1.146112600536193, "grad_norm": 0.6785795092582703, "learning_rate": 4.996755705556182e-05, "loss": 0.7887, "num_input_tokens_seen": 4436032, "step": 7695 }, { "epoch": 1.1468573130771522, "grad_norm": 0.8640424013137817, "learning_rate": 4.9967225233129366e-05, "loss": 0.7958, "num_input_tokens_seen": 4438944, "step": 7700 }, { "epoch": 1.1476020256181114, "grad_norm": 0.546356201171875, "learning_rate": 4.9966891723513344e-05, "loss": 0.8484, "num_input_tokens_seen": 4441632, "step": 7705 }, { "epoch": 1.1483467381590706, "grad_norm": 0.6197919845581055, "learning_rate": 4.996655652673628e-05, "loss": 0.8018, "num_input_tokens_seen": 4444832, "step": 7710 }, { "epoch": 1.1490914507000298, "grad_norm": 0.881277322769165, "learning_rate": 4.9966219642820834e-05, "loss": 0.8185, "num_input_tokens_seen": 4447616, "step": 7715 }, { "epoch": 1.149836163240989, "grad_norm": 0.7558993697166443, "learning_rate": 4.996588107178977e-05, "loss": 0.8189, "num_input_tokens_seen": 4450496, "step": 7720 }, { "epoch": 1.1505808757819482, "grad_norm": 0.5707380175590515, "learning_rate": 4.996554081366597e-05, "loss": 0.82, "num_input_tokens_seen": 4453312, "step": 7725 }, { "epoch": 1.1513255883229074, "grad_norm": 0.934043288230896, "learning_rate": 4.996519886847243e-05, "loss": 0.8048, "num_input_tokens_seen": 4456608, "step": 7730 }, { "epoch": 1.1520703008638666, "grad_norm": 0.7868462800979614, "learning_rate": 4.996485523623224e-05, "loss": 0.7757, "num_input_tokens_seen": 4459744, "step": 7735 }, { "epoch": 1.1528150134048256, "grad_norm": 0.61797034740448, "learning_rate": 4.996450991696864e-05, "loss": 0.7813, "num_input_tokens_seen": 4462624, "step": 7740 }, { "epoch": 1.1535597259457848, "grad_norm": 0.5998008847236633, "learning_rate": 4.996416291070495e-05, "loss": 0.8236, "num_input_tokens_seen": 4465856, "step": 7745 }, { "epoch": 1.154304438486744, "grad_norm": 0.46448659896850586, "learning_rate": 4.996381421746464e-05, "loss": 0.852, "num_input_tokens_seen": 4468704, "step": 7750 }, { "epoch": 1.1550491510277032, "grad_norm": 0.8716573119163513, "learning_rate": 4.9963463837271254e-05, "loss": 0.7799, "num_input_tokens_seen": 4471776, "step": 7755 }, { "epoch": 1.1557938635686624, "grad_norm": 0.6691290736198425, "learning_rate": 4.996311177014847e-05, "loss": 0.8147, "num_input_tokens_seen": 4474848, "step": 7760 }, { "epoch": 1.1565385761096216, "grad_norm": 0.6875317096710205, "learning_rate": 4.9962758016120095e-05, "loss": 0.8075, "num_input_tokens_seen": 4477760, "step": 7765 }, { "epoch": 1.1572832886505808, "grad_norm": 0.5504992604255676, "learning_rate": 4.996240257521002e-05, "loss": 0.837, "num_input_tokens_seen": 4480800, "step": 7770 }, { "epoch": 1.15802800119154, "grad_norm": 0.7436399459838867, "learning_rate": 4.996204544744227e-05, "loss": 0.7862, "num_input_tokens_seen": 4483936, "step": 7775 }, { "epoch": 1.1587727137324992, "grad_norm": 0.5935561060905457, "learning_rate": 4.9961686632840976e-05, "loss": 0.7619, "num_input_tokens_seen": 4486752, "step": 7780 }, { "epoch": 1.1595174262734584, "grad_norm": 0.547564685344696, "learning_rate": 4.9961326131430386e-05, "loss": 0.8385, "num_input_tokens_seen": 4489472, "step": 7785 }, { "epoch": 1.1602621388144176, "grad_norm": 0.5691767930984497, "learning_rate": 4.996096394323486e-05, "loss": 0.8328, "num_input_tokens_seen": 4492256, "step": 7790 }, { "epoch": 1.1610068513553768, "grad_norm": 0.5965315103530884, "learning_rate": 4.9960600068278876e-05, "loss": 0.8227, "num_input_tokens_seen": 4495072, "step": 7795 }, { "epoch": 1.161751563896336, "grad_norm": 0.6540803909301758, "learning_rate": 4.9960234506587024e-05, "loss": 0.8044, "num_input_tokens_seen": 4498656, "step": 7800 }, { "epoch": 1.1624962764372953, "grad_norm": 0.4790574312210083, "learning_rate": 4.9959867258184e-05, "loss": 0.7833, "num_input_tokens_seen": 4501280, "step": 7805 }, { "epoch": 1.1632409889782545, "grad_norm": 0.6741700768470764, "learning_rate": 4.9959498323094636e-05, "loss": 0.7774, "num_input_tokens_seen": 4504064, "step": 7810 }, { "epoch": 1.1639857015192137, "grad_norm": 0.7495623826980591, "learning_rate": 4.9959127701343844e-05, "loss": 0.8343, "num_input_tokens_seen": 4506816, "step": 7815 }, { "epoch": 1.1647304140601729, "grad_norm": 0.6158543229103088, "learning_rate": 4.995875539295668e-05, "loss": 0.818, "num_input_tokens_seen": 4509280, "step": 7820 }, { "epoch": 1.165475126601132, "grad_norm": 0.5748996734619141, "learning_rate": 4.9958381397958305e-05, "loss": 0.8157, "num_input_tokens_seen": 4512192, "step": 7825 }, { "epoch": 1.1662198391420913, "grad_norm": 0.4947417676448822, "learning_rate": 4.995800571637399e-05, "loss": 0.7991, "num_input_tokens_seen": 4515008, "step": 7830 }, { "epoch": 1.1669645516830505, "grad_norm": 0.5039109587669373, "learning_rate": 4.995762834822911e-05, "loss": 0.8208, "num_input_tokens_seen": 4517856, "step": 7835 }, { "epoch": 1.1677092642240094, "grad_norm": 0.898036777973175, "learning_rate": 4.995724929354918e-05, "loss": 0.8268, "num_input_tokens_seen": 4521024, "step": 7840 }, { "epoch": 1.1684539767649686, "grad_norm": 0.5417826771736145, "learning_rate": 4.9956868552359816e-05, "loss": 0.8155, "num_input_tokens_seen": 4523840, "step": 7845 }, { "epoch": 1.1691986893059279, "grad_norm": 0.6625925898551941, "learning_rate": 4.995648612468674e-05, "loss": 0.8332, "num_input_tokens_seen": 4526528, "step": 7850 }, { "epoch": 1.169943401846887, "grad_norm": 0.6963664293289185, "learning_rate": 4.9956102010555806e-05, "loss": 0.8323, "num_input_tokens_seen": 4529312, "step": 7855 }, { "epoch": 1.1706881143878463, "grad_norm": 1.5032198429107666, "learning_rate": 4.9955716209992956e-05, "loss": 0.8275, "num_input_tokens_seen": 4531840, "step": 7860 }, { "epoch": 1.1714328269288055, "grad_norm": 0.725410521030426, "learning_rate": 4.9955328723024263e-05, "loss": 0.8053, "num_input_tokens_seen": 4534752, "step": 7865 }, { "epoch": 1.1721775394697647, "grad_norm": 0.4589863419532776, "learning_rate": 4.995493954967592e-05, "loss": 0.8045, "num_input_tokens_seen": 4537408, "step": 7870 }, { "epoch": 1.1729222520107239, "grad_norm": 0.6909633874893188, "learning_rate": 4.995454868997421e-05, "loss": 0.8223, "num_input_tokens_seen": 4540736, "step": 7875 }, { "epoch": 1.173666964551683, "grad_norm": 0.6749770045280457, "learning_rate": 4.9954156143945575e-05, "loss": 0.8289, "num_input_tokens_seen": 4543584, "step": 7880 }, { "epoch": 1.1744116770926423, "grad_norm": 0.8407008051872253, "learning_rate": 4.9953761911616515e-05, "loss": 0.7953, "num_input_tokens_seen": 4546656, "step": 7885 }, { "epoch": 1.1751563896336015, "grad_norm": 0.7800027132034302, "learning_rate": 4.995336599301368e-05, "loss": 0.8302, "num_input_tokens_seen": 4549376, "step": 7890 }, { "epoch": 1.1759011021745607, "grad_norm": 0.6522210240364075, "learning_rate": 4.9952968388163826e-05, "loss": 0.884, "num_input_tokens_seen": 4552128, "step": 7895 }, { "epoch": 1.1766458147155199, "grad_norm": 0.5853920578956604, "learning_rate": 4.995256909709382e-05, "loss": 0.7905, "num_input_tokens_seen": 4554720, "step": 7900 }, { "epoch": 1.177390527256479, "grad_norm": 0.6291080117225647, "learning_rate": 4.9952168119830644e-05, "loss": 0.8165, "num_input_tokens_seen": 4557472, "step": 7905 }, { "epoch": 1.1781352397974383, "grad_norm": 0.6305346488952637, "learning_rate": 4.995176545640139e-05, "loss": 0.8604, "num_input_tokens_seen": 4560576, "step": 7910 }, { "epoch": 1.1788799523383973, "grad_norm": 0.7762690782546997, "learning_rate": 4.995136110683328e-05, "loss": 0.8018, "num_input_tokens_seen": 4563552, "step": 7915 }, { "epoch": 1.1796246648793565, "grad_norm": 0.5235304236412048, "learning_rate": 4.995095507115363e-05, "loss": 0.7966, "num_input_tokens_seen": 4566208, "step": 7920 }, { "epoch": 1.1803693774203157, "grad_norm": 0.6571189761161804, "learning_rate": 4.9950547349389873e-05, "loss": 0.8565, "num_input_tokens_seen": 4569248, "step": 7925 }, { "epoch": 1.1811140899612749, "grad_norm": 0.6478996872901917, "learning_rate": 4.995013794156957e-05, "loss": 0.8209, "num_input_tokens_seen": 4571904, "step": 7930 }, { "epoch": 1.181858802502234, "grad_norm": 0.6671205759048462, "learning_rate": 4.994972684772039e-05, "loss": 0.8495, "num_input_tokens_seen": 4574688, "step": 7935 }, { "epoch": 1.1826035150431933, "grad_norm": 0.745535135269165, "learning_rate": 4.9949314067870105e-05, "loss": 0.8236, "num_input_tokens_seen": 4577504, "step": 7940 }, { "epoch": 1.1833482275841525, "grad_norm": 0.7800391316413879, "learning_rate": 4.9948899602046614e-05, "loss": 0.8141, "num_input_tokens_seen": 4580384, "step": 7945 }, { "epoch": 1.1840929401251117, "grad_norm": 0.8534844517707825, "learning_rate": 4.9948483450277915e-05, "loss": 0.7824, "num_input_tokens_seen": 4583456, "step": 7950 }, { "epoch": 1.1848376526660709, "grad_norm": 0.6138526797294617, "learning_rate": 4.9948065612592145e-05, "loss": 0.822, "num_input_tokens_seen": 4586336, "step": 7955 }, { "epoch": 1.18558236520703, "grad_norm": 0.8092853426933289, "learning_rate": 4.9947646089017534e-05, "loss": 0.8015, "num_input_tokens_seen": 4588832, "step": 7960 }, { "epoch": 1.1863270777479893, "grad_norm": 0.6217524409294128, "learning_rate": 4.994722487958242e-05, "loss": 0.8147, "num_input_tokens_seen": 4591552, "step": 7965 }, { "epoch": 1.1870717902889485, "grad_norm": 0.8574263453483582, "learning_rate": 4.994680198431528e-05, "loss": 0.8222, "num_input_tokens_seen": 4594752, "step": 7970 }, { "epoch": 1.1878165028299077, "grad_norm": 0.6739116311073303, "learning_rate": 4.9946377403244695e-05, "loss": 0.8451, "num_input_tokens_seen": 4597792, "step": 7975 }, { "epoch": 1.188561215370867, "grad_norm": 0.9244599938392639, "learning_rate": 4.994595113639935e-05, "loss": 0.852, "num_input_tokens_seen": 4600544, "step": 7980 }, { "epoch": 1.189305927911826, "grad_norm": 0.8036144375801086, "learning_rate": 4.994552318380804e-05, "loss": 0.8367, "num_input_tokens_seen": 4603392, "step": 7985 }, { "epoch": 1.1900506404527853, "grad_norm": 0.6471827626228333, "learning_rate": 4.9945093545499706e-05, "loss": 0.9051, "num_input_tokens_seen": 4606112, "step": 7990 }, { "epoch": 1.1907953529937445, "grad_norm": 0.9130696654319763, "learning_rate": 4.9944662221503364e-05, "loss": 0.8579, "num_input_tokens_seen": 4609120, "step": 7995 }, { "epoch": 1.1915400655347037, "grad_norm": 0.5308722257614136, "learning_rate": 4.9944229211848166e-05, "loss": 0.8128, "num_input_tokens_seen": 4611904, "step": 8000 }, { "epoch": 1.192284778075663, "grad_norm": 0.8715817928314209, "learning_rate": 4.9943794516563366e-05, "loss": 0.8095, "num_input_tokens_seen": 4614528, "step": 8005 }, { "epoch": 1.193029490616622, "grad_norm": 0.7474427223205566, "learning_rate": 4.9943358135678366e-05, "loss": 0.7958, "num_input_tokens_seen": 4617408, "step": 8010 }, { "epoch": 1.193774203157581, "grad_norm": 0.6513242721557617, "learning_rate": 4.994292006922262e-05, "loss": 0.8023, "num_input_tokens_seen": 4620256, "step": 8015 }, { "epoch": 1.1945189156985403, "grad_norm": 0.5626654624938965, "learning_rate": 4.994248031722575e-05, "loss": 0.848, "num_input_tokens_seen": 4623136, "step": 8020 }, { "epoch": 1.1952636282394995, "grad_norm": 0.7236822247505188, "learning_rate": 4.994203887971747e-05, "loss": 0.8241, "num_input_tokens_seen": 4625792, "step": 8025 }, { "epoch": 1.1960083407804587, "grad_norm": 0.8803151249885559, "learning_rate": 4.994159575672761e-05, "loss": 0.829, "num_input_tokens_seen": 4628544, "step": 8030 }, { "epoch": 1.196753053321418, "grad_norm": 0.6995881199836731, "learning_rate": 4.9941150948286106e-05, "loss": 0.7726, "num_input_tokens_seen": 4631232, "step": 8035 }, { "epoch": 1.197497765862377, "grad_norm": 0.7069797515869141, "learning_rate": 4.994070445442304e-05, "loss": 0.8259, "num_input_tokens_seen": 4634144, "step": 8040 }, { "epoch": 1.1982424784033363, "grad_norm": 0.8324509263038635, "learning_rate": 4.994025627516856e-05, "loss": 0.8137, "num_input_tokens_seen": 4637088, "step": 8045 }, { "epoch": 1.1989871909442955, "grad_norm": 0.6798885464668274, "learning_rate": 4.9939806410552955e-05, "loss": 0.8447, "num_input_tokens_seen": 4639840, "step": 8050 }, { "epoch": 1.1997319034852547, "grad_norm": 0.6832720637321472, "learning_rate": 4.9939354860606636e-05, "loss": 0.8083, "num_input_tokens_seen": 4642400, "step": 8055 }, { "epoch": 1.200476616026214, "grad_norm": 0.5935811996459961, "learning_rate": 4.9938901625360115e-05, "loss": 0.7899, "num_input_tokens_seen": 4644960, "step": 8060 }, { "epoch": 1.2012213285671731, "grad_norm": 0.49295973777770996, "learning_rate": 4.993844670484401e-05, "loss": 0.8177, "num_input_tokens_seen": 4647680, "step": 8065 }, { "epoch": 1.2019660411081323, "grad_norm": 0.9821552634239197, "learning_rate": 4.993799009908907e-05, "loss": 0.8474, "num_input_tokens_seen": 4650496, "step": 8070 }, { "epoch": 1.2027107536490915, "grad_norm": 0.7642701864242554, "learning_rate": 4.9937531808126155e-05, "loss": 0.8342, "num_input_tokens_seen": 4653312, "step": 8075 }, { "epoch": 1.2034554661900507, "grad_norm": 0.7468724846839905, "learning_rate": 4.993707183198623e-05, "loss": 0.8578, "num_input_tokens_seen": 4656064, "step": 8080 }, { "epoch": 1.2042001787310097, "grad_norm": 0.5705596208572388, "learning_rate": 4.993661017070037e-05, "loss": 0.8304, "num_input_tokens_seen": 4658720, "step": 8085 }, { "epoch": 1.204944891271969, "grad_norm": 0.8409830331802368, "learning_rate": 4.993614682429978e-05, "loss": 0.8095, "num_input_tokens_seen": 4662176, "step": 8090 }, { "epoch": 1.2056896038129281, "grad_norm": 0.7900805473327637, "learning_rate": 4.993568179281577e-05, "loss": 0.8409, "num_input_tokens_seen": 4665152, "step": 8095 }, { "epoch": 1.2064343163538873, "grad_norm": 0.8748136758804321, "learning_rate": 4.9935215076279766e-05, "loss": 0.7856, "num_input_tokens_seen": 4668096, "step": 8100 }, { "epoch": 1.2071790288948465, "grad_norm": 0.7343164682388306, "learning_rate": 4.993474667472331e-05, "loss": 0.822, "num_input_tokens_seen": 4671040, "step": 8105 }, { "epoch": 1.2079237414358057, "grad_norm": 0.5212990045547485, "learning_rate": 4.9934276588178054e-05, "loss": 0.8376, "num_input_tokens_seen": 4673952, "step": 8110 }, { "epoch": 1.208668453976765, "grad_norm": 0.7917633652687073, "learning_rate": 4.993380481667576e-05, "loss": 0.8363, "num_input_tokens_seen": 4676928, "step": 8115 }, { "epoch": 1.2094131665177241, "grad_norm": 0.7203346490859985, "learning_rate": 4.9933331360248306e-05, "loss": 0.8031, "num_input_tokens_seen": 4679776, "step": 8120 }, { "epoch": 1.2101578790586833, "grad_norm": 0.7553468942642212, "learning_rate": 4.993285621892769e-05, "loss": 0.7751, "num_input_tokens_seen": 4682624, "step": 8125 }, { "epoch": 1.2109025915996425, "grad_norm": 0.5618879795074463, "learning_rate": 4.993237939274602e-05, "loss": 0.8465, "num_input_tokens_seen": 4685280, "step": 8130 }, { "epoch": 1.2116473041406017, "grad_norm": 0.607506275177002, "learning_rate": 4.9931900881735517e-05, "loss": 0.8068, "num_input_tokens_seen": 4688192, "step": 8135 }, { "epoch": 1.212392016681561, "grad_norm": 0.5211078524589539, "learning_rate": 4.993142068592852e-05, "loss": 0.8195, "num_input_tokens_seen": 4691008, "step": 8140 }, { "epoch": 1.2131367292225201, "grad_norm": 0.6175633072853088, "learning_rate": 4.993093880535748e-05, "loss": 0.8259, "num_input_tokens_seen": 4694080, "step": 8145 }, { "epoch": 1.2138814417634793, "grad_norm": 0.78282630443573, "learning_rate": 4.993045524005496e-05, "loss": 0.8141, "num_input_tokens_seen": 4696896, "step": 8150 }, { "epoch": 1.2146261543044385, "grad_norm": 0.7988092303276062, "learning_rate": 4.992996999005363e-05, "loss": 0.8374, "num_input_tokens_seen": 4699712, "step": 8155 }, { "epoch": 1.2153708668453977, "grad_norm": 0.7768690586090088, "learning_rate": 4.992948305538628e-05, "loss": 0.7808, "num_input_tokens_seen": 4702528, "step": 8160 }, { "epoch": 1.216115579386357, "grad_norm": 0.5621901750564575, "learning_rate": 4.992899443608583e-05, "loss": 0.8027, "num_input_tokens_seen": 4705024, "step": 8165 }, { "epoch": 1.2168602919273162, "grad_norm": 0.6822494864463806, "learning_rate": 4.9928504132185284e-05, "loss": 0.7949, "num_input_tokens_seen": 4707776, "step": 8170 }, { "epoch": 1.2176050044682754, "grad_norm": 0.6890209317207336, "learning_rate": 4.992801214371778e-05, "loss": 0.8512, "num_input_tokens_seen": 4710496, "step": 8175 }, { "epoch": 1.2183497170092346, "grad_norm": 0.7144855260848999, "learning_rate": 4.992751847071657e-05, "loss": 0.8293, "num_input_tokens_seen": 4713632, "step": 8180 }, { "epoch": 1.2190944295501935, "grad_norm": 0.7436007857322693, "learning_rate": 4.992702311321501e-05, "loss": 0.8565, "num_input_tokens_seen": 4716512, "step": 8185 }, { "epoch": 1.2198391420911527, "grad_norm": 0.7433807849884033, "learning_rate": 4.992652607124658e-05, "loss": 0.8045, "num_input_tokens_seen": 4719392, "step": 8190 }, { "epoch": 1.220583854632112, "grad_norm": 0.518399715423584, "learning_rate": 4.992602734484485e-05, "loss": 0.7788, "num_input_tokens_seen": 4722304, "step": 8195 }, { "epoch": 1.2213285671730711, "grad_norm": 0.6026225686073303, "learning_rate": 4.992552693404354e-05, "loss": 0.8327, "num_input_tokens_seen": 4724960, "step": 8200 }, { "epoch": 1.2220732797140303, "grad_norm": 0.44914713501930237, "learning_rate": 4.992502483887645e-05, "loss": 0.8113, "num_input_tokens_seen": 4727808, "step": 8205 }, { "epoch": 1.2228179922549895, "grad_norm": 0.8700822591781616, "learning_rate": 4.9924521059377535e-05, "loss": 0.8167, "num_input_tokens_seen": 4730944, "step": 8210 }, { "epoch": 1.2235627047959488, "grad_norm": 0.5337986350059509, "learning_rate": 4.992401559558081e-05, "loss": 0.8412, "num_input_tokens_seen": 4733664, "step": 8215 }, { "epoch": 1.224307417336908, "grad_norm": 0.6733003854751587, "learning_rate": 4.992350844752045e-05, "loss": 0.8326, "num_input_tokens_seen": 4736608, "step": 8220 }, { "epoch": 1.2250521298778672, "grad_norm": 0.7003393769264221, "learning_rate": 4.9922999615230726e-05, "loss": 0.8705, "num_input_tokens_seen": 4739904, "step": 8225 }, { "epoch": 1.2257968424188264, "grad_norm": 0.5208679437637329, "learning_rate": 4.992248909874601e-05, "loss": 0.8062, "num_input_tokens_seen": 4742784, "step": 8230 }, { "epoch": 1.2265415549597856, "grad_norm": 0.5086901783943176, "learning_rate": 4.992197689810081e-05, "loss": 0.8273, "num_input_tokens_seen": 4745696, "step": 8235 }, { "epoch": 1.2272862675007448, "grad_norm": 0.6784970760345459, "learning_rate": 4.9921463013329736e-05, "loss": 0.8301, "num_input_tokens_seen": 4748800, "step": 8240 }, { "epoch": 1.228030980041704, "grad_norm": 0.542657732963562, "learning_rate": 4.9920947444467515e-05, "loss": 0.8069, "num_input_tokens_seen": 4751712, "step": 8245 }, { "epoch": 1.2287756925826632, "grad_norm": 0.6828400492668152, "learning_rate": 4.9920430191548986e-05, "loss": 0.7819, "num_input_tokens_seen": 4754464, "step": 8250 }, { "epoch": 1.2295204051236224, "grad_norm": 0.6036034822463989, "learning_rate": 4.9919911254609105e-05, "loss": 0.7996, "num_input_tokens_seen": 4757280, "step": 8255 }, { "epoch": 1.2302651176645814, "grad_norm": 0.563001275062561, "learning_rate": 4.991939063368294e-05, "loss": 0.8499, "num_input_tokens_seen": 4760128, "step": 8260 }, { "epoch": 1.2310098302055406, "grad_norm": 0.47075730562210083, "learning_rate": 4.991886832880567e-05, "loss": 0.8205, "num_input_tokens_seen": 4762976, "step": 8265 }, { "epoch": 1.2317545427464998, "grad_norm": 0.6646090149879456, "learning_rate": 4.9918344340012584e-05, "loss": 0.7532, "num_input_tokens_seen": 4765920, "step": 8270 }, { "epoch": 1.232499255287459, "grad_norm": 0.48896390199661255, "learning_rate": 4.99178186673391e-05, "loss": 0.8422, "num_input_tokens_seen": 4768736, "step": 8275 }, { "epoch": 1.2332439678284182, "grad_norm": 0.6209976077079773, "learning_rate": 4.9917291310820745e-05, "loss": 0.7711, "num_input_tokens_seen": 4771488, "step": 8280 }, { "epoch": 1.2339886803693774, "grad_norm": 0.574951171875, "learning_rate": 4.9916762270493154e-05, "loss": 0.8439, "num_input_tokens_seen": 4774368, "step": 8285 }, { "epoch": 1.2347333929103366, "grad_norm": 0.9231170415878296, "learning_rate": 4.991623154639207e-05, "loss": 0.8055, "num_input_tokens_seen": 4777216, "step": 8290 }, { "epoch": 1.2354781054512958, "grad_norm": 0.5322160720825195, "learning_rate": 4.991569913855335e-05, "loss": 0.8043, "num_input_tokens_seen": 4780192, "step": 8295 }, { "epoch": 1.236222817992255, "grad_norm": 0.769310474395752, "learning_rate": 4.991516504701299e-05, "loss": 0.8386, "num_input_tokens_seen": 4783232, "step": 8300 }, { "epoch": 1.2369675305332142, "grad_norm": 0.5404117703437805, "learning_rate": 4.991462927180707e-05, "loss": 0.8332, "num_input_tokens_seen": 4786208, "step": 8305 }, { "epoch": 1.2377122430741734, "grad_norm": 0.6202139854431152, "learning_rate": 4.991409181297181e-05, "loss": 0.82, "num_input_tokens_seen": 4789024, "step": 8310 }, { "epoch": 1.2384569556151326, "grad_norm": 0.8083113431930542, "learning_rate": 4.991355267054351e-05, "loss": 0.8667, "num_input_tokens_seen": 4791808, "step": 8315 }, { "epoch": 1.2392016681560918, "grad_norm": 0.48623862862586975, "learning_rate": 4.991301184455861e-05, "loss": 0.7702, "num_input_tokens_seen": 4794592, "step": 8320 }, { "epoch": 1.239946380697051, "grad_norm": 0.6354064345359802, "learning_rate": 4.9912469335053656e-05, "loss": 0.8186, "num_input_tokens_seen": 4797664, "step": 8325 }, { "epoch": 1.2406910932380102, "grad_norm": 0.6889854073524475, "learning_rate": 4.991192514206532e-05, "loss": 0.781, "num_input_tokens_seen": 4800352, "step": 8330 }, { "epoch": 1.2414358057789694, "grad_norm": 0.6505404710769653, "learning_rate": 4.991137926563036e-05, "loss": 0.8287, "num_input_tokens_seen": 4803040, "step": 8335 }, { "epoch": 1.2421805183199286, "grad_norm": 0.4441041052341461, "learning_rate": 4.991083170578568e-05, "loss": 0.8321, "num_input_tokens_seen": 4806048, "step": 8340 }, { "epoch": 1.2429252308608878, "grad_norm": 0.563078761100769, "learning_rate": 4.991028246256826e-05, "loss": 0.7981, "num_input_tokens_seen": 4808864, "step": 8345 }, { "epoch": 1.243669943401847, "grad_norm": 0.5500127673149109, "learning_rate": 4.9909731536015235e-05, "loss": 0.7828, "num_input_tokens_seen": 4811680, "step": 8350 }, { "epoch": 1.244414655942806, "grad_norm": 0.6147940754890442, "learning_rate": 4.9909178926163835e-05, "loss": 0.851, "num_input_tokens_seen": 4814880, "step": 8355 }, { "epoch": 1.2451593684837652, "grad_norm": 0.6519120931625366, "learning_rate": 4.9908624633051395e-05, "loss": 0.8247, "num_input_tokens_seen": 4818048, "step": 8360 }, { "epoch": 1.2459040810247244, "grad_norm": 0.6190519332885742, "learning_rate": 4.990806865671537e-05, "loss": 0.7685, "num_input_tokens_seen": 4821248, "step": 8365 }, { "epoch": 1.2466487935656836, "grad_norm": 0.6958194375038147, "learning_rate": 4.990751099719333e-05, "loss": 0.8034, "num_input_tokens_seen": 4824160, "step": 8370 }, { "epoch": 1.2473935061066428, "grad_norm": 0.907314121723175, "learning_rate": 4.990695165452297e-05, "loss": 0.8345, "num_input_tokens_seen": 4826848, "step": 8375 }, { "epoch": 1.248138218647602, "grad_norm": 0.6906267404556274, "learning_rate": 4.990639062874208e-05, "loss": 0.8105, "num_input_tokens_seen": 4829696, "step": 8380 }, { "epoch": 1.2488829311885612, "grad_norm": 0.47865983843803406, "learning_rate": 4.990582791988857e-05, "loss": 0.8134, "num_input_tokens_seen": 4832512, "step": 8385 }, { "epoch": 1.2496276437295204, "grad_norm": 0.5241755247116089, "learning_rate": 4.990526352800047e-05, "loss": 0.7777, "num_input_tokens_seen": 4835488, "step": 8390 }, { "epoch": 1.2503723562704796, "grad_norm": 0.5202213525772095, "learning_rate": 4.990469745311592e-05, "loss": 0.8302, "num_input_tokens_seen": 4838304, "step": 8395 }, { "epoch": 1.2511170688114388, "grad_norm": 0.48262444138526917, "learning_rate": 4.990412969527317e-05, "loss": 0.8204, "num_input_tokens_seen": 4840832, "step": 8400 }, { "epoch": 1.251861781352398, "grad_norm": 0.609626293182373, "learning_rate": 4.99035602545106e-05, "loss": 0.8216, "num_input_tokens_seen": 4843680, "step": 8405 }, { "epoch": 1.2526064938933572, "grad_norm": 0.6750274896621704, "learning_rate": 4.990298913086666e-05, "loss": 0.8138, "num_input_tokens_seen": 4846720, "step": 8410 }, { "epoch": 1.2533512064343164, "grad_norm": 0.6575154662132263, "learning_rate": 4.990241632437997e-05, "loss": 0.8054, "num_input_tokens_seen": 4849728, "step": 8415 }, { "epoch": 1.2540959189752756, "grad_norm": 0.5914298295974731, "learning_rate": 4.990184183508923e-05, "loss": 0.7859, "num_input_tokens_seen": 4852800, "step": 8420 }, { "epoch": 1.2548406315162346, "grad_norm": 0.5246714949607849, "learning_rate": 4.990126566303326e-05, "loss": 0.8729, "num_input_tokens_seen": 4855776, "step": 8425 }, { "epoch": 1.2555853440571938, "grad_norm": 0.6232919096946716, "learning_rate": 4.9900687808251e-05, "loss": 0.8466, "num_input_tokens_seen": 4858592, "step": 8430 }, { "epoch": 1.256330056598153, "grad_norm": 0.48597413301467896, "learning_rate": 4.99001082707815e-05, "loss": 0.7796, "num_input_tokens_seen": 4861408, "step": 8435 }, { "epoch": 1.2570747691391122, "grad_norm": 0.6760628819465637, "learning_rate": 4.989952705066392e-05, "loss": 0.833, "num_input_tokens_seen": 4864224, "step": 8440 }, { "epoch": 1.2578194816800714, "grad_norm": 0.6125149726867676, "learning_rate": 4.9898944147937534e-05, "loss": 0.8045, "num_input_tokens_seen": 4866944, "step": 8445 }, { "epoch": 1.2585641942210306, "grad_norm": 0.6491338610649109, "learning_rate": 4.989835956264173e-05, "loss": 0.8172, "num_input_tokens_seen": 4870016, "step": 8450 }, { "epoch": 1.2593089067619898, "grad_norm": 0.6707763075828552, "learning_rate": 4.989777329481602e-05, "loss": 0.8241, "num_input_tokens_seen": 4872864, "step": 8455 }, { "epoch": 1.260053619302949, "grad_norm": 0.5394715666770935, "learning_rate": 4.989718534450002e-05, "loss": 0.7915, "num_input_tokens_seen": 4875584, "step": 8460 }, { "epoch": 1.2607983318439082, "grad_norm": 0.6056733131408691, "learning_rate": 4.989659571173345e-05, "loss": 0.8347, "num_input_tokens_seen": 4878400, "step": 8465 }, { "epoch": 1.2615430443848674, "grad_norm": 0.5147128701210022, "learning_rate": 4.9896004396556176e-05, "loss": 0.8245, "num_input_tokens_seen": 4881440, "step": 8470 }, { "epoch": 1.2622877569258266, "grad_norm": 0.6486163139343262, "learning_rate": 4.989541139900814e-05, "loss": 0.8039, "num_input_tokens_seen": 4884608, "step": 8475 }, { "epoch": 1.2630324694667858, "grad_norm": 0.5453237891197205, "learning_rate": 4.989481671912941e-05, "loss": 0.8104, "num_input_tokens_seen": 4887520, "step": 8480 }, { "epoch": 1.263777182007745, "grad_norm": 0.5239493250846863, "learning_rate": 4.989422035696019e-05, "loss": 0.8221, "num_input_tokens_seen": 4890432, "step": 8485 }, { "epoch": 1.2645218945487042, "grad_norm": 0.5221325159072876, "learning_rate": 4.9893622312540764e-05, "loss": 0.8054, "num_input_tokens_seen": 4893056, "step": 8490 }, { "epoch": 1.2652666070896634, "grad_norm": 0.6615121364593506, "learning_rate": 4.989302258591157e-05, "loss": 0.8021, "num_input_tokens_seen": 4896096, "step": 8495 }, { "epoch": 1.2660113196306226, "grad_norm": 0.6779914498329163, "learning_rate": 4.98924211771131e-05, "loss": 0.7889, "num_input_tokens_seen": 4899232, "step": 8500 }, { "epoch": 1.2667560321715818, "grad_norm": 0.5960347056388855, "learning_rate": 4.9891818086186014e-05, "loss": 0.8071, "num_input_tokens_seen": 4901920, "step": 8505 }, { "epoch": 1.267500744712541, "grad_norm": 0.5542863607406616, "learning_rate": 4.989121331317107e-05, "loss": 0.8751, "num_input_tokens_seen": 4904800, "step": 8510 }, { "epoch": 1.2682454572535002, "grad_norm": 0.5408080220222473, "learning_rate": 4.9890606858109126e-05, "loss": 0.7915, "num_input_tokens_seen": 4907456, "step": 8515 }, { "epoch": 1.2689901697944594, "grad_norm": 0.7140949964523315, "learning_rate": 4.9889998721041173e-05, "loss": 0.8085, "num_input_tokens_seen": 4910464, "step": 8520 }, { "epoch": 1.2697348823354186, "grad_norm": 0.6078310608863831, "learning_rate": 4.98893889020083e-05, "loss": 0.8441, "num_input_tokens_seen": 4913600, "step": 8525 }, { "epoch": 1.2704795948763778, "grad_norm": 0.6100184917449951, "learning_rate": 4.988877740105171e-05, "loss": 0.7687, "num_input_tokens_seen": 4916544, "step": 8530 }, { "epoch": 1.2712243074173368, "grad_norm": 0.5942439436912537, "learning_rate": 4.9888164218212746e-05, "loss": 0.8407, "num_input_tokens_seen": 4919232, "step": 8535 }, { "epoch": 1.271969019958296, "grad_norm": 0.5860390663146973, "learning_rate": 4.988754935353282e-05, "loss": 0.8465, "num_input_tokens_seen": 4921984, "step": 8540 }, { "epoch": 1.2727137324992552, "grad_norm": 0.6952153444290161, "learning_rate": 4.988693280705351e-05, "loss": 0.8574, "num_input_tokens_seen": 4924896, "step": 8545 }, { "epoch": 1.2734584450402144, "grad_norm": 0.7284922003746033, "learning_rate": 4.988631457881645e-05, "loss": 0.8463, "num_input_tokens_seen": 4927616, "step": 8550 }, { "epoch": 1.2742031575811736, "grad_norm": 0.7748461365699768, "learning_rate": 4.9885694668863435e-05, "loss": 0.8415, "num_input_tokens_seen": 4930624, "step": 8555 }, { "epoch": 1.2749478701221328, "grad_norm": 0.7129857540130615, "learning_rate": 4.9885073077236354e-05, "loss": 0.8069, "num_input_tokens_seen": 4933376, "step": 8560 }, { "epoch": 1.275692582663092, "grad_norm": 0.6845942735671997, "learning_rate": 4.988444980397721e-05, "loss": 0.8143, "num_input_tokens_seen": 4936256, "step": 8565 }, { "epoch": 1.2764372952040512, "grad_norm": 0.67442387342453, "learning_rate": 4.9883824849128125e-05, "loss": 0.7877, "num_input_tokens_seen": 4938944, "step": 8570 }, { "epoch": 1.2771820077450105, "grad_norm": 0.6372007131576538, "learning_rate": 4.988319821273132e-05, "loss": 0.826, "num_input_tokens_seen": 4942528, "step": 8575 }, { "epoch": 1.2779267202859697, "grad_norm": 0.8124439716339111, "learning_rate": 4.9882569894829144e-05, "loss": 0.8423, "num_input_tokens_seen": 4945280, "step": 8580 }, { "epoch": 1.2786714328269289, "grad_norm": 0.7967134714126587, "learning_rate": 4.988193989546407e-05, "loss": 0.8429, "num_input_tokens_seen": 4948352, "step": 8585 }, { "epoch": 1.279416145367888, "grad_norm": 0.6615080833435059, "learning_rate": 4.988130821467866e-05, "loss": 0.8233, "num_input_tokens_seen": 4951296, "step": 8590 }, { "epoch": 1.2801608579088473, "grad_norm": 0.4855306148529053, "learning_rate": 4.988067485251559e-05, "loss": 0.8264, "num_input_tokens_seen": 4953984, "step": 8595 }, { "epoch": 1.2809055704498062, "grad_norm": 0.7555674314498901, "learning_rate": 4.988003980901768e-05, "loss": 0.8242, "num_input_tokens_seen": 4956736, "step": 8600 }, { "epoch": 1.2816502829907654, "grad_norm": 0.6322144865989685, "learning_rate": 4.987940308422783e-05, "loss": 0.7967, "num_input_tokens_seen": 4959712, "step": 8605 }, { "epoch": 1.2823949955317246, "grad_norm": 0.6408448219299316, "learning_rate": 4.9878764678189075e-05, "loss": 0.8059, "num_input_tokens_seen": 4962816, "step": 8610 }, { "epoch": 1.2831397080726838, "grad_norm": 0.5807810425758362, "learning_rate": 4.9878124590944555e-05, "loss": 0.8447, "num_input_tokens_seen": 4965952, "step": 8615 }, { "epoch": 1.283884420613643, "grad_norm": 0.5500701665878296, "learning_rate": 4.9877482822537516e-05, "loss": 0.7697, "num_input_tokens_seen": 4968832, "step": 8620 }, { "epoch": 1.2846291331546023, "grad_norm": 0.6481621861457825, "learning_rate": 4.987683937301133e-05, "loss": 0.8497, "num_input_tokens_seen": 4971744, "step": 8625 }, { "epoch": 1.2853738456955615, "grad_norm": 0.5872336030006409, "learning_rate": 4.987619424240949e-05, "loss": 0.8491, "num_input_tokens_seen": 4974368, "step": 8630 }, { "epoch": 1.2861185582365207, "grad_norm": 0.6076794862747192, "learning_rate": 4.9875547430775575e-05, "loss": 0.8351, "num_input_tokens_seen": 4977440, "step": 8635 }, { "epoch": 1.2868632707774799, "grad_norm": 0.8209302425384521, "learning_rate": 4.98748989381533e-05, "loss": 0.8225, "num_input_tokens_seen": 4980256, "step": 8640 }, { "epoch": 1.287607983318439, "grad_norm": 0.6237776875495911, "learning_rate": 4.98742487645865e-05, "loss": 0.8031, "num_input_tokens_seen": 4983072, "step": 8645 }, { "epoch": 1.2883526958593983, "grad_norm": 0.6715023517608643, "learning_rate": 4.987359691011909e-05, "loss": 0.8182, "num_input_tokens_seen": 4985856, "step": 8650 }, { "epoch": 1.2890974084003575, "grad_norm": 0.6999937295913696, "learning_rate": 4.987294337479513e-05, "loss": 0.8321, "num_input_tokens_seen": 4988896, "step": 8655 }, { "epoch": 1.2898421209413167, "grad_norm": 0.7599250078201294, "learning_rate": 4.987228815865879e-05, "loss": 0.8391, "num_input_tokens_seen": 4991744, "step": 8660 }, { "epoch": 1.2905868334822759, "grad_norm": 0.6858123540878296, "learning_rate": 4.987163126175434e-05, "loss": 0.8068, "num_input_tokens_seen": 4994624, "step": 8665 }, { "epoch": 1.291331546023235, "grad_norm": 0.6062481999397278, "learning_rate": 4.987097268412616e-05, "loss": 0.8161, "num_input_tokens_seen": 4997600, "step": 8670 }, { "epoch": 1.2920762585641943, "grad_norm": 0.564129114151001, "learning_rate": 4.987031242581877e-05, "loss": 0.7662, "num_input_tokens_seen": 5000544, "step": 8675 }, { "epoch": 1.2928209711051535, "grad_norm": 0.5970856547355652, "learning_rate": 4.9869650486876786e-05, "loss": 0.7843, "num_input_tokens_seen": 5003616, "step": 8680 }, { "epoch": 1.2935656836461127, "grad_norm": 0.624262273311615, "learning_rate": 4.986898686734493e-05, "loss": 0.8298, "num_input_tokens_seen": 5006400, "step": 8685 }, { "epoch": 1.2943103961870719, "grad_norm": 0.5362256169319153, "learning_rate": 4.9868321567268043e-05, "loss": 0.8162, "num_input_tokens_seen": 5008832, "step": 8690 }, { "epoch": 1.295055108728031, "grad_norm": 0.4945752024650574, "learning_rate": 4.98676545866911e-05, "loss": 0.803, "num_input_tokens_seen": 5011776, "step": 8695 }, { "epoch": 1.2957998212689903, "grad_norm": 0.675841212272644, "learning_rate": 4.986698592565917e-05, "loss": 0.8183, "num_input_tokens_seen": 5014560, "step": 8700 }, { "epoch": 1.2965445338099495, "grad_norm": 0.6117100119590759, "learning_rate": 4.986631558421742e-05, "loss": 0.8171, "num_input_tokens_seen": 5016992, "step": 8705 }, { "epoch": 1.2972892463509085, "grad_norm": 0.541395902633667, "learning_rate": 4.986564356241117e-05, "loss": 0.8181, "num_input_tokens_seen": 5019904, "step": 8710 }, { "epoch": 1.2980339588918677, "grad_norm": 0.7314924001693726, "learning_rate": 4.986496986028583e-05, "loss": 0.8405, "num_input_tokens_seen": 5022752, "step": 8715 }, { "epoch": 1.2987786714328269, "grad_norm": 0.5725283026695251, "learning_rate": 4.986429447788691e-05, "loss": 0.8404, "num_input_tokens_seen": 5025408, "step": 8720 }, { "epoch": 1.299523383973786, "grad_norm": 0.566034734249115, "learning_rate": 4.986361741526006e-05, "loss": 0.8215, "num_input_tokens_seen": 5028576, "step": 8725 }, { "epoch": 1.3002680965147453, "grad_norm": 0.6684859395027161, "learning_rate": 4.9862938672451045e-05, "loss": 0.7707, "num_input_tokens_seen": 5031328, "step": 8730 }, { "epoch": 1.3010128090557045, "grad_norm": 0.5242829322814941, "learning_rate": 4.986225824950571e-05, "loss": 0.8284, "num_input_tokens_seen": 5034400, "step": 8735 }, { "epoch": 1.3017575215966637, "grad_norm": 0.5023152828216553, "learning_rate": 4.986157614647005e-05, "loss": 0.78, "num_input_tokens_seen": 5037344, "step": 8740 }, { "epoch": 1.302502234137623, "grad_norm": 0.5047112703323364, "learning_rate": 4.9860892363390145e-05, "loss": 0.8031, "num_input_tokens_seen": 5040128, "step": 8745 }, { "epoch": 1.303246946678582, "grad_norm": 0.6167760491371155, "learning_rate": 4.986020690031221e-05, "loss": 0.8201, "num_input_tokens_seen": 5042944, "step": 8750 }, { "epoch": 1.3039916592195413, "grad_norm": 0.5564603805541992, "learning_rate": 4.985951975728258e-05, "loss": 0.8072, "num_input_tokens_seen": 5045664, "step": 8755 }, { "epoch": 1.3047363717605005, "grad_norm": 0.5625122785568237, "learning_rate": 4.9858830934347665e-05, "loss": 0.7958, "num_input_tokens_seen": 5048416, "step": 8760 }, { "epoch": 1.3054810843014597, "grad_norm": 0.6045095324516296, "learning_rate": 4.9858140431554036e-05, "loss": 0.8125, "num_input_tokens_seen": 5051264, "step": 8765 }, { "epoch": 1.306225796842419, "grad_norm": 0.8740713000297546, "learning_rate": 4.9857448248948336e-05, "loss": 0.8485, "num_input_tokens_seen": 5054048, "step": 8770 }, { "epoch": 1.3069705093833779, "grad_norm": 0.5002163648605347, "learning_rate": 4.985675438657734e-05, "loss": 0.8347, "num_input_tokens_seen": 5056544, "step": 8775 }, { "epoch": 1.307715221924337, "grad_norm": 0.519096851348877, "learning_rate": 4.985605884448795e-05, "loss": 0.8117, "num_input_tokens_seen": 5059296, "step": 8780 }, { "epoch": 1.3084599344652963, "grad_norm": 0.5375208854675293, "learning_rate": 4.985536162272716e-05, "loss": 0.8329, "num_input_tokens_seen": 5061952, "step": 8785 }, { "epoch": 1.3092046470062555, "grad_norm": 0.683081865310669, "learning_rate": 4.9854662721342086e-05, "loss": 0.7953, "num_input_tokens_seen": 5064800, "step": 8790 }, { "epoch": 1.3099493595472147, "grad_norm": 0.48134344816207886, "learning_rate": 4.985396214037995e-05, "loss": 0.8719, "num_input_tokens_seen": 5067584, "step": 8795 }, { "epoch": 1.310694072088174, "grad_norm": 0.4522058665752411, "learning_rate": 4.9853259879888116e-05, "loss": 0.8579, "num_input_tokens_seen": 5070592, "step": 8800 }, { "epoch": 1.311438784629133, "grad_norm": 0.5816574692726135, "learning_rate": 4.9852555939914014e-05, "loss": 0.8323, "num_input_tokens_seen": 5073504, "step": 8805 }, { "epoch": 1.3121834971700923, "grad_norm": 0.7014254927635193, "learning_rate": 4.9851850320505225e-05, "loss": 0.797, "num_input_tokens_seen": 5076384, "step": 8810 }, { "epoch": 1.3129282097110515, "grad_norm": 0.8483842015266418, "learning_rate": 4.985114302170943e-05, "loss": 0.854, "num_input_tokens_seen": 5079360, "step": 8815 }, { "epoch": 1.3136729222520107, "grad_norm": 0.5262211561203003, "learning_rate": 4.985043404357444e-05, "loss": 0.8346, "num_input_tokens_seen": 5081952, "step": 8820 }, { "epoch": 1.31441763479297, "grad_norm": 0.5208064913749695, "learning_rate": 4.984972338614814e-05, "loss": 0.8127, "num_input_tokens_seen": 5085024, "step": 8825 }, { "epoch": 1.3151623473339291, "grad_norm": 0.5543586015701294, "learning_rate": 4.984901104947857e-05, "loss": 0.8001, "num_input_tokens_seen": 5087584, "step": 8830 }, { "epoch": 1.3159070598748883, "grad_norm": 0.7406814098358154, "learning_rate": 4.984829703361386e-05, "loss": 0.7705, "num_input_tokens_seen": 5090464, "step": 8835 }, { "epoch": 1.3166517724158475, "grad_norm": 0.5070358514785767, "learning_rate": 4.984758133860227e-05, "loss": 0.8001, "num_input_tokens_seen": 5093376, "step": 8840 }, { "epoch": 1.3173964849568067, "grad_norm": 0.4612013101577759, "learning_rate": 4.984686396449214e-05, "loss": 0.8484, "num_input_tokens_seen": 5096416, "step": 8845 }, { "epoch": 1.318141197497766, "grad_norm": 0.6692695021629333, "learning_rate": 4.984614491133197e-05, "loss": 0.7838, "num_input_tokens_seen": 5099328, "step": 8850 }, { "epoch": 1.3188859100387251, "grad_norm": 0.6500869989395142, "learning_rate": 4.984542417917035e-05, "loss": 0.8175, "num_input_tokens_seen": 5102112, "step": 8855 }, { "epoch": 1.3196306225796843, "grad_norm": 0.6213661432266235, "learning_rate": 4.984470176805598e-05, "loss": 0.7975, "num_input_tokens_seen": 5104896, "step": 8860 }, { "epoch": 1.3203753351206435, "grad_norm": 0.49959200620651245, "learning_rate": 4.9843977678037666e-05, "loss": 0.8034, "num_input_tokens_seen": 5107648, "step": 8865 }, { "epoch": 1.3211200476616027, "grad_norm": 0.5716552138328552, "learning_rate": 4.984325190916435e-05, "loss": 0.7981, "num_input_tokens_seen": 5110432, "step": 8870 }, { "epoch": 1.321864760202562, "grad_norm": 0.6130920052528381, "learning_rate": 4.984252446148508e-05, "loss": 0.8454, "num_input_tokens_seen": 5113472, "step": 8875 }, { "epoch": 1.322609472743521, "grad_norm": 0.5416496396064758, "learning_rate": 4.9841795335049006e-05, "loss": 0.7871, "num_input_tokens_seen": 5116416, "step": 8880 }, { "epoch": 1.3233541852844801, "grad_norm": 0.6183486580848694, "learning_rate": 4.98410645299054e-05, "loss": 0.8204, "num_input_tokens_seen": 5119168, "step": 8885 }, { "epoch": 1.3240988978254393, "grad_norm": 0.4513007402420044, "learning_rate": 4.9840332046103656e-05, "loss": 0.8084, "num_input_tokens_seen": 5121696, "step": 8890 }, { "epoch": 1.3248436103663985, "grad_norm": 0.5060774087905884, "learning_rate": 4.9839597883693267e-05, "loss": 0.8367, "num_input_tokens_seen": 5124704, "step": 8895 }, { "epoch": 1.3255883229073577, "grad_norm": 0.7332743406295776, "learning_rate": 4.983886204272383e-05, "loss": 0.8647, "num_input_tokens_seen": 5127520, "step": 8900 }, { "epoch": 1.326333035448317, "grad_norm": 0.5077376961708069, "learning_rate": 4.98381245232451e-05, "loss": 0.7985, "num_input_tokens_seen": 5130176, "step": 8905 }, { "epoch": 1.3270777479892761, "grad_norm": 0.6757739782333374, "learning_rate": 4.98373853253069e-05, "loss": 0.7895, "num_input_tokens_seen": 5133152, "step": 8910 }, { "epoch": 1.3278224605302353, "grad_norm": 0.5670937895774841, "learning_rate": 4.983664444895917e-05, "loss": 0.8498, "num_input_tokens_seen": 5136064, "step": 8915 }, { "epoch": 1.3285671730711945, "grad_norm": 0.8432490825653076, "learning_rate": 4.983590189425198e-05, "loss": 0.8258, "num_input_tokens_seen": 5138848, "step": 8920 }, { "epoch": 1.3293118856121537, "grad_norm": 0.6609945297241211, "learning_rate": 4.9835157661235534e-05, "loss": 0.8396, "num_input_tokens_seen": 5142080, "step": 8925 }, { "epoch": 1.330056598153113, "grad_norm": 0.7133399844169617, "learning_rate": 4.98344117499601e-05, "loss": 0.839, "num_input_tokens_seen": 5145152, "step": 8930 }, { "epoch": 1.3308013106940721, "grad_norm": 0.6291927099227905, "learning_rate": 4.983366416047608e-05, "loss": 0.7882, "num_input_tokens_seen": 5147936, "step": 8935 }, { "epoch": 1.3315460232350314, "grad_norm": 0.666543185710907, "learning_rate": 4.983291489283401e-05, "loss": 0.7959, "num_input_tokens_seen": 5150944, "step": 8940 }, { "epoch": 1.3322907357759903, "grad_norm": 0.4703145921230316, "learning_rate": 4.983216394708451e-05, "loss": 0.785, "num_input_tokens_seen": 5153920, "step": 8945 }, { "epoch": 1.3330354483169495, "grad_norm": 0.6563631296157837, "learning_rate": 4.983141132327833e-05, "loss": 0.7785, "num_input_tokens_seen": 5156864, "step": 8950 }, { "epoch": 1.3337801608579087, "grad_norm": 0.7717398405075073, "learning_rate": 4.983065702146634e-05, "loss": 0.809, "num_input_tokens_seen": 5159360, "step": 8955 }, { "epoch": 1.334524873398868, "grad_norm": 0.5806555151939392, "learning_rate": 4.98299010416995e-05, "loss": 0.7804, "num_input_tokens_seen": 5161952, "step": 8960 }, { "epoch": 1.3352695859398271, "grad_norm": 1.0464624166488647, "learning_rate": 4.982914338402889e-05, "loss": 0.8079, "num_input_tokens_seen": 5164896, "step": 8965 }, { "epoch": 1.3360142984807863, "grad_norm": 0.559657633304596, "learning_rate": 4.982838404850573e-05, "loss": 0.7501, "num_input_tokens_seen": 5167680, "step": 8970 }, { "epoch": 1.3367590110217455, "grad_norm": 0.8008733987808228, "learning_rate": 4.982762303518131e-05, "loss": 0.8969, "num_input_tokens_seen": 5170944, "step": 8975 }, { "epoch": 1.3375037235627047, "grad_norm": 0.6923806071281433, "learning_rate": 4.982686034410707e-05, "loss": 0.7823, "num_input_tokens_seen": 5173824, "step": 8980 }, { "epoch": 1.338248436103664, "grad_norm": 0.5279638171195984, "learning_rate": 4.982609597533455e-05, "loss": 0.8039, "num_input_tokens_seen": 5176640, "step": 8985 }, { "epoch": 1.3389931486446232, "grad_norm": 0.8732529282569885, "learning_rate": 4.98253299289154e-05, "loss": 0.8423, "num_input_tokens_seen": 5179424, "step": 8990 }, { "epoch": 1.3397378611855824, "grad_norm": 0.7313226461410522, "learning_rate": 4.982456220490138e-05, "loss": 0.9463, "num_input_tokens_seen": 5182592, "step": 8995 }, { "epoch": 1.3404825737265416, "grad_norm": 0.5059576034545898, "learning_rate": 4.982379280334438e-05, "loss": 0.8352, "num_input_tokens_seen": 5185376, "step": 9000 }, { "epoch": 1.3412272862675008, "grad_norm": 0.7300413846969604, "learning_rate": 4.982302172429638e-05, "loss": 0.7743, "num_input_tokens_seen": 5188288, "step": 9005 }, { "epoch": 1.34197199880846, "grad_norm": 0.61313396692276, "learning_rate": 4.98222489678095e-05, "loss": 0.8298, "num_input_tokens_seen": 5191264, "step": 9010 }, { "epoch": 1.3427167113494192, "grad_norm": 0.5816280841827393, "learning_rate": 4.9821474533935966e-05, "loss": 0.9258, "num_input_tokens_seen": 5195488, "step": 9015 }, { "epoch": 1.3434614238903784, "grad_norm": 0.6247066259384155, "learning_rate": 4.982069842272809e-05, "loss": 0.8035, "num_input_tokens_seen": 5198112, "step": 9020 }, { "epoch": 1.3442061364313376, "grad_norm": 0.6739490628242493, "learning_rate": 4.9819920634238323e-05, "loss": 0.7978, "num_input_tokens_seen": 5201024, "step": 9025 }, { "epoch": 1.3449508489722968, "grad_norm": 0.6011804938316345, "learning_rate": 4.981914116851924e-05, "loss": 0.8494, "num_input_tokens_seen": 5203968, "step": 9030 }, { "epoch": 1.345695561513256, "grad_norm": 0.6009278297424316, "learning_rate": 4.98183600256235e-05, "loss": 0.8277, "num_input_tokens_seen": 5206976, "step": 9035 }, { "epoch": 1.3464402740542152, "grad_norm": 0.5459724068641663, "learning_rate": 4.981757720560389e-05, "loss": 0.8162, "num_input_tokens_seen": 5210176, "step": 9040 }, { "epoch": 1.3471849865951744, "grad_norm": 0.8239949941635132, "learning_rate": 4.981679270851332e-05, "loss": 0.8233, "num_input_tokens_seen": 5213024, "step": 9045 }, { "epoch": 1.3479296991361336, "grad_norm": 0.5705427527427673, "learning_rate": 4.981600653440479e-05, "loss": 0.781, "num_input_tokens_seen": 5216064, "step": 9050 }, { "epoch": 1.3486744116770926, "grad_norm": 0.6002869009971619, "learning_rate": 4.981521868333144e-05, "loss": 0.842, "num_input_tokens_seen": 5219200, "step": 9055 }, { "epoch": 1.3494191242180518, "grad_norm": 0.5669281482696533, "learning_rate": 4.98144291553465e-05, "loss": 0.8239, "num_input_tokens_seen": 5222336, "step": 9060 }, { "epoch": 1.350163836759011, "grad_norm": 0.5451793074607849, "learning_rate": 4.981363795050332e-05, "loss": 0.8166, "num_input_tokens_seen": 5225248, "step": 9065 }, { "epoch": 1.3509085492999702, "grad_norm": 0.5882170796394348, "learning_rate": 4.9812845068855384e-05, "loss": 0.7957, "num_input_tokens_seen": 5228320, "step": 9070 }, { "epoch": 1.3516532618409294, "grad_norm": 0.6295026540756226, "learning_rate": 4.9812050510456254e-05, "loss": 0.82, "num_input_tokens_seen": 5231008, "step": 9075 }, { "epoch": 1.3523979743818886, "grad_norm": 0.6747910976409912, "learning_rate": 4.9811254275359626e-05, "loss": 0.7972, "num_input_tokens_seen": 5233792, "step": 9080 }, { "epoch": 1.3531426869228478, "grad_norm": 0.5102699995040894, "learning_rate": 4.9810456363619304e-05, "loss": 0.8154, "num_input_tokens_seen": 5236768, "step": 9085 }, { "epoch": 1.353887399463807, "grad_norm": 1.0029300451278687, "learning_rate": 4.980965677528923e-05, "loss": 0.8436, "num_input_tokens_seen": 5239936, "step": 9090 }, { "epoch": 1.3546321120047662, "grad_norm": 0.6164464950561523, "learning_rate": 4.98088555104234e-05, "loss": 0.8254, "num_input_tokens_seen": 5242624, "step": 9095 }, { "epoch": 1.3553768245457254, "grad_norm": 0.49462127685546875, "learning_rate": 4.9808052569076e-05, "loss": 0.7872, "num_input_tokens_seen": 5245344, "step": 9100 }, { "epoch": 1.3561215370866846, "grad_norm": 0.6016001105308533, "learning_rate": 4.9807247951301255e-05, "loss": 0.8206, "num_input_tokens_seen": 5248352, "step": 9105 }, { "epoch": 1.3568662496276438, "grad_norm": 0.7100877165794373, "learning_rate": 4.9806441657153555e-05, "loss": 0.8515, "num_input_tokens_seen": 5251328, "step": 9110 }, { "epoch": 1.357610962168603, "grad_norm": 0.47385722398757935, "learning_rate": 4.9805633686687394e-05, "loss": 0.8369, "num_input_tokens_seen": 5253952, "step": 9115 }, { "epoch": 1.358355674709562, "grad_norm": 0.6037843823432922, "learning_rate": 4.980482403995734e-05, "loss": 0.8248, "num_input_tokens_seen": 5256736, "step": 9120 }, { "epoch": 1.3591003872505212, "grad_norm": 0.5344014167785645, "learning_rate": 4.9804012717018146e-05, "loss": 0.8542, "num_input_tokens_seen": 5259808, "step": 9125 }, { "epoch": 1.3598450997914804, "grad_norm": 0.6037695407867432, "learning_rate": 4.980319971792461e-05, "loss": 0.7787, "num_input_tokens_seen": 5263104, "step": 9130 }, { "epoch": 1.3605898123324396, "grad_norm": 0.5460473895072937, "learning_rate": 4.980238504273168e-05, "loss": 0.8276, "num_input_tokens_seen": 5265696, "step": 9135 }, { "epoch": 1.3613345248733988, "grad_norm": 0.6374845504760742, "learning_rate": 4.98015686914944e-05, "loss": 0.8313, "num_input_tokens_seen": 5268416, "step": 9140 }, { "epoch": 1.362079237414358, "grad_norm": 0.5469788312911987, "learning_rate": 4.980075066426796e-05, "loss": 0.791, "num_input_tokens_seen": 5271744, "step": 9145 }, { "epoch": 1.3628239499553172, "grad_norm": 0.5434243083000183, "learning_rate": 4.979993096110762e-05, "loss": 0.8022, "num_input_tokens_seen": 5274784, "step": 9150 }, { "epoch": 1.3635686624962764, "grad_norm": 0.5097401738166809, "learning_rate": 4.979910958206876e-05, "loss": 0.8406, "num_input_tokens_seen": 5277472, "step": 9155 }, { "epoch": 1.3643133750372356, "grad_norm": 0.7316239476203918, "learning_rate": 4.9798286527206915e-05, "loss": 0.8162, "num_input_tokens_seen": 5280160, "step": 9160 }, { "epoch": 1.3650580875781948, "grad_norm": 0.5755894184112549, "learning_rate": 4.979746179657768e-05, "loss": 0.8442, "num_input_tokens_seen": 5283200, "step": 9165 }, { "epoch": 1.365802800119154, "grad_norm": 0.6218770146369934, "learning_rate": 4.97966353902368e-05, "loss": 0.7714, "num_input_tokens_seen": 5286432, "step": 9170 }, { "epoch": 1.3665475126601132, "grad_norm": 0.672437310218811, "learning_rate": 4.9795807308240115e-05, "loss": 0.8403, "num_input_tokens_seen": 5289248, "step": 9175 }, { "epoch": 1.3672922252010724, "grad_norm": 0.6801339983940125, "learning_rate": 4.979497755064359e-05, "loss": 0.7566, "num_input_tokens_seen": 5292064, "step": 9180 }, { "epoch": 1.3680369377420316, "grad_norm": 0.7244670987129211, "learning_rate": 4.979414611750329e-05, "loss": 0.8372, "num_input_tokens_seen": 5294848, "step": 9185 }, { "epoch": 1.3687816502829908, "grad_norm": 0.5655310153961182, "learning_rate": 4.97933130088754e-05, "loss": 0.8504, "num_input_tokens_seen": 5297472, "step": 9190 }, { "epoch": 1.36952636282395, "grad_norm": 0.6101661324501038, "learning_rate": 4.9792478224816206e-05, "loss": 0.7903, "num_input_tokens_seen": 5300320, "step": 9195 }, { "epoch": 1.3702710753649092, "grad_norm": 0.5893477201461792, "learning_rate": 4.979164176538215e-05, "loss": 0.7413, "num_input_tokens_seen": 5303104, "step": 9200 }, { "epoch": 1.3710157879058684, "grad_norm": 0.4631311893463135, "learning_rate": 4.979080363062974e-05, "loss": 0.7863, "num_input_tokens_seen": 5306048, "step": 9205 }, { "epoch": 1.3717605004468276, "grad_norm": 0.5738562345504761, "learning_rate": 4.978996382061559e-05, "loss": 0.8118, "num_input_tokens_seen": 5308992, "step": 9210 }, { "epoch": 1.3725052129877868, "grad_norm": 0.8125275373458862, "learning_rate": 4.978912233539649e-05, "loss": 0.8291, "num_input_tokens_seen": 5311872, "step": 9215 }, { "epoch": 1.373249925528746, "grad_norm": 0.5495816469192505, "learning_rate": 4.978827917502929e-05, "loss": 0.822, "num_input_tokens_seen": 5314592, "step": 9220 }, { "epoch": 1.3739946380697052, "grad_norm": 0.5354670286178589, "learning_rate": 4.978743433957096e-05, "loss": 0.7747, "num_input_tokens_seen": 5317536, "step": 9225 }, { "epoch": 1.3747393506106642, "grad_norm": 0.4915149509906769, "learning_rate": 4.97865878290786e-05, "loss": 0.8591, "num_input_tokens_seen": 5320640, "step": 9230 }, { "epoch": 1.3754840631516234, "grad_norm": 0.529897928237915, "learning_rate": 4.9785739643609406e-05, "loss": 0.7491, "num_input_tokens_seen": 5323552, "step": 9235 }, { "epoch": 1.3762287756925826, "grad_norm": 0.3833126127719879, "learning_rate": 4.97848897832207e-05, "loss": 0.7791, "num_input_tokens_seen": 5326400, "step": 9240 }, { "epoch": 1.3769734882335418, "grad_norm": 0.6137766242027283, "learning_rate": 4.978403824796991e-05, "loss": 0.8343, "num_input_tokens_seen": 5329376, "step": 9245 }, { "epoch": 1.377718200774501, "grad_norm": 0.511597216129303, "learning_rate": 4.978318503791458e-05, "loss": 0.8281, "num_input_tokens_seen": 5332448, "step": 9250 }, { "epoch": 1.3784629133154602, "grad_norm": 0.5176748037338257, "learning_rate": 4.978233015311236e-05, "loss": 0.8607, "num_input_tokens_seen": 5335136, "step": 9255 }, { "epoch": 1.3792076258564194, "grad_norm": 0.5715530514717102, "learning_rate": 4.978147359362103e-05, "loss": 0.8585, "num_input_tokens_seen": 5338112, "step": 9260 }, { "epoch": 1.3799523383973786, "grad_norm": 0.44185128808021545, "learning_rate": 4.978061535949847e-05, "loss": 0.7628, "num_input_tokens_seen": 5341056, "step": 9265 }, { "epoch": 1.3806970509383378, "grad_norm": 0.497566819190979, "learning_rate": 4.9779755450802675e-05, "loss": 0.7952, "num_input_tokens_seen": 5344128, "step": 9270 }, { "epoch": 1.381441763479297, "grad_norm": 0.7580606341362, "learning_rate": 4.977889386759176e-05, "loss": 0.8476, "num_input_tokens_seen": 5346848, "step": 9275 }, { "epoch": 1.3821864760202562, "grad_norm": 0.5981879234313965, "learning_rate": 4.977803060992393e-05, "loss": 0.841, "num_input_tokens_seen": 5349856, "step": 9280 }, { "epoch": 1.3829311885612154, "grad_norm": 0.6430658102035522, "learning_rate": 4.977716567785754e-05, "loss": 0.7921, "num_input_tokens_seen": 5352544, "step": 9285 }, { "epoch": 1.3836759011021746, "grad_norm": 0.4962419271469116, "learning_rate": 4.977629907145102e-05, "loss": 0.8006, "num_input_tokens_seen": 5355392, "step": 9290 }, { "epoch": 1.3844206136431336, "grad_norm": 0.5322789549827576, "learning_rate": 4.977543079076295e-05, "loss": 0.7895, "num_input_tokens_seen": 5358112, "step": 9295 }, { "epoch": 1.3851653261840928, "grad_norm": 0.5168625712394714, "learning_rate": 4.977456083585199e-05, "loss": 0.8104, "num_input_tokens_seen": 5360960, "step": 9300 }, { "epoch": 1.385910038725052, "grad_norm": 0.5632907152175903, "learning_rate": 4.977368920677694e-05, "loss": 0.8174, "num_input_tokens_seen": 5363616, "step": 9305 }, { "epoch": 1.3866547512660112, "grad_norm": 0.5511189103126526, "learning_rate": 4.97728159035967e-05, "loss": 0.7975, "num_input_tokens_seen": 5366240, "step": 9310 }, { "epoch": 1.3873994638069704, "grad_norm": 0.6582174897193909, "learning_rate": 4.9771940926370274e-05, "loss": 0.8163, "num_input_tokens_seen": 5368768, "step": 9315 }, { "epoch": 1.3881441763479296, "grad_norm": 0.5406380295753479, "learning_rate": 4.97710642751568e-05, "loss": 0.7869, "num_input_tokens_seen": 5371584, "step": 9320 }, { "epoch": 1.3888888888888888, "grad_norm": 0.6411635279655457, "learning_rate": 4.977018595001551e-05, "loss": 0.8261, "num_input_tokens_seen": 5374400, "step": 9325 }, { "epoch": 1.389633601429848, "grad_norm": 0.6864987015724182, "learning_rate": 4.9769305951005766e-05, "loss": 0.8284, "num_input_tokens_seen": 5377280, "step": 9330 }, { "epoch": 1.3903783139708072, "grad_norm": 0.5777857899665833, "learning_rate": 4.976842427818702e-05, "loss": 0.8465, "num_input_tokens_seen": 5380032, "step": 9335 }, { "epoch": 1.3911230265117664, "grad_norm": 0.6675447821617126, "learning_rate": 4.9767540931618874e-05, "loss": 0.7936, "num_input_tokens_seen": 5383200, "step": 9340 }, { "epoch": 1.3918677390527256, "grad_norm": 0.6160411834716797, "learning_rate": 4.9766655911361e-05, "loss": 0.8264, "num_input_tokens_seen": 5386336, "step": 9345 }, { "epoch": 1.3926124515936849, "grad_norm": 0.5715731382369995, "learning_rate": 4.976576921747322e-05, "loss": 0.827, "num_input_tokens_seen": 5389120, "step": 9350 }, { "epoch": 1.393357164134644, "grad_norm": 0.5737208724021912, "learning_rate": 4.976488085001545e-05, "loss": 0.859, "num_input_tokens_seen": 5392064, "step": 9355 }, { "epoch": 1.3941018766756033, "grad_norm": 0.7853167057037354, "learning_rate": 4.976399080904771e-05, "loss": 0.8142, "num_input_tokens_seen": 5394880, "step": 9360 }, { "epoch": 1.3948465892165625, "grad_norm": 0.767599880695343, "learning_rate": 4.9763099094630164e-05, "loss": 0.8427, "num_input_tokens_seen": 5397696, "step": 9365 }, { "epoch": 1.3955913017575217, "grad_norm": 0.45824846625328064, "learning_rate": 4.976220570682305e-05, "loss": 0.796, "num_input_tokens_seen": 5400480, "step": 9370 }, { "epoch": 1.3963360142984809, "grad_norm": 0.532163143157959, "learning_rate": 4.976131064568675e-05, "loss": 0.8259, "num_input_tokens_seen": 5403488, "step": 9375 }, { "epoch": 1.39708072683944, "grad_norm": 0.6342623233795166, "learning_rate": 4.976041391128175e-05, "loss": 0.8351, "num_input_tokens_seen": 5406240, "step": 9380 }, { "epoch": 1.3978254393803993, "grad_norm": 0.59047931432724, "learning_rate": 4.975951550366866e-05, "loss": 0.7938, "num_input_tokens_seen": 5409216, "step": 9385 }, { "epoch": 1.3985701519213585, "grad_norm": 0.4791274666786194, "learning_rate": 4.9758615422908164e-05, "loss": 0.7891, "num_input_tokens_seen": 5412064, "step": 9390 }, { "epoch": 1.3993148644623177, "grad_norm": 0.6690757274627686, "learning_rate": 4.97577136690611e-05, "loss": 0.7863, "num_input_tokens_seen": 5415104, "step": 9395 }, { "epoch": 1.4000595770032767, "grad_norm": 0.614340603351593, "learning_rate": 4.975681024218841e-05, "loss": 0.7634, "num_input_tokens_seen": 5418048, "step": 9400 }, { "epoch": 1.4008042895442359, "grad_norm": 0.4101216495037079, "learning_rate": 4.9755905142351133e-05, "loss": 0.7943, "num_input_tokens_seen": 5421024, "step": 9405 }, { "epoch": 1.401549002085195, "grad_norm": 0.531609833240509, "learning_rate": 4.975499836961044e-05, "loss": 0.8418, "num_input_tokens_seen": 5423616, "step": 9410 }, { "epoch": 1.4022937146261543, "grad_norm": 0.5556610822677612, "learning_rate": 4.97540899240276e-05, "loss": 0.7579, "num_input_tokens_seen": 5426368, "step": 9415 }, { "epoch": 1.4030384271671135, "grad_norm": 0.6945252418518066, "learning_rate": 4.9753179805664e-05, "loss": 0.8158, "num_input_tokens_seen": 5429120, "step": 9420 }, { "epoch": 1.4037831397080727, "grad_norm": 0.8120536804199219, "learning_rate": 4.975226801458116e-05, "loss": 0.9099, "num_input_tokens_seen": 5431936, "step": 9425 }, { "epoch": 1.4045278522490319, "grad_norm": 0.629226803779602, "learning_rate": 4.975135455084067e-05, "loss": 0.8004, "num_input_tokens_seen": 5434816, "step": 9430 }, { "epoch": 1.405272564789991, "grad_norm": 0.546386182308197, "learning_rate": 4.975043941450428e-05, "loss": 0.8082, "num_input_tokens_seen": 5437632, "step": 9435 }, { "epoch": 1.4060172773309503, "grad_norm": 0.6008752584457397, "learning_rate": 4.9749522605633825e-05, "loss": 0.7976, "num_input_tokens_seen": 5440352, "step": 9440 }, { "epoch": 1.4067619898719095, "grad_norm": 0.6814506649971008, "learning_rate": 4.9748604124291254e-05, "loss": 0.8121, "num_input_tokens_seen": 5443424, "step": 9445 }, { "epoch": 1.4075067024128687, "grad_norm": 0.4542296528816223, "learning_rate": 4.974768397053863e-05, "loss": 0.8133, "num_input_tokens_seen": 5446272, "step": 9450 }, { "epoch": 1.4082514149538279, "grad_norm": 0.668889582157135, "learning_rate": 4.9746762144438144e-05, "loss": 0.8013, "num_input_tokens_seen": 5449344, "step": 9455 }, { "epoch": 1.408996127494787, "grad_norm": 0.5469821691513062, "learning_rate": 4.974583864605209e-05, "loss": 0.7672, "num_input_tokens_seen": 5452544, "step": 9460 }, { "epoch": 1.409740840035746, "grad_norm": 0.6492924690246582, "learning_rate": 4.974491347544287e-05, "loss": 0.8036, "num_input_tokens_seen": 5455200, "step": 9465 }, { "epoch": 1.4104855525767053, "grad_norm": 0.5924614071846008, "learning_rate": 4.974398663267299e-05, "loss": 0.8614, "num_input_tokens_seen": 5457760, "step": 9470 }, { "epoch": 1.4112302651176645, "grad_norm": 0.44229820370674133, "learning_rate": 4.9743058117805105e-05, "loss": 0.8015, "num_input_tokens_seen": 5460576, "step": 9475 }, { "epoch": 1.4119749776586237, "grad_norm": 0.571307897567749, "learning_rate": 4.974212793090195e-05, "loss": 0.7607, "num_input_tokens_seen": 5463456, "step": 9480 }, { "epoch": 1.4127196901995829, "grad_norm": 0.42783409357070923, "learning_rate": 4.974119607202638e-05, "loss": 0.7655, "num_input_tokens_seen": 5466208, "step": 9485 }, { "epoch": 1.413464402740542, "grad_norm": 0.5592432022094727, "learning_rate": 4.974026254124138e-05, "loss": 0.8001, "num_input_tokens_seen": 5469184, "step": 9490 }, { "epoch": 1.4142091152815013, "grad_norm": 0.6948234438896179, "learning_rate": 4.973932733861001e-05, "loss": 0.845, "num_input_tokens_seen": 5472096, "step": 9495 }, { "epoch": 1.4149538278224605, "grad_norm": 0.5125910043716431, "learning_rate": 4.9738390464195486e-05, "loss": 0.7868, "num_input_tokens_seen": 5474752, "step": 9500 }, { "epoch": 1.4156985403634197, "grad_norm": 0.6107876896858215, "learning_rate": 4.973745191806112e-05, "loss": 0.8643, "num_input_tokens_seen": 5477792, "step": 9505 }, { "epoch": 1.416443252904379, "grad_norm": 0.616986870765686, "learning_rate": 4.9736511700270324e-05, "loss": 0.808, "num_input_tokens_seen": 5480864, "step": 9510 }, { "epoch": 1.417187965445338, "grad_norm": 0.5692204236984253, "learning_rate": 4.973556981088664e-05, "loss": 0.786, "num_input_tokens_seen": 5483584, "step": 9515 }, { "epoch": 1.4179326779862973, "grad_norm": 0.5592240691184998, "learning_rate": 4.9734626249973715e-05, "loss": 0.8344, "num_input_tokens_seen": 5486368, "step": 9520 }, { "epoch": 1.4186773905272565, "grad_norm": 0.4656725525856018, "learning_rate": 4.973368101759531e-05, "loss": 0.8202, "num_input_tokens_seen": 5489344, "step": 9525 }, { "epoch": 1.4194221030682157, "grad_norm": 0.44155019521713257, "learning_rate": 4.97327341138153e-05, "loss": 0.8293, "num_input_tokens_seen": 5492416, "step": 9530 }, { "epoch": 1.420166815609175, "grad_norm": 0.600674033164978, "learning_rate": 4.973178553869767e-05, "loss": 0.8284, "num_input_tokens_seen": 5495520, "step": 9535 }, { "epoch": 1.420911528150134, "grad_norm": 0.5129657983779907, "learning_rate": 4.973083529230654e-05, "loss": 0.8092, "num_input_tokens_seen": 5498272, "step": 9540 }, { "epoch": 1.4216562406910933, "grad_norm": 0.43641260266304016, "learning_rate": 4.97298833747061e-05, "loss": 0.8522, "num_input_tokens_seen": 5501344, "step": 9545 }, { "epoch": 1.4224009532320525, "grad_norm": 0.5029006600379944, "learning_rate": 4.972892978596069e-05, "loss": 0.7645, "num_input_tokens_seen": 5504352, "step": 9550 }, { "epoch": 1.4231456657730117, "grad_norm": 0.43185189366340637, "learning_rate": 4.972797452613474e-05, "loss": 0.8181, "num_input_tokens_seen": 5507232, "step": 9555 }, { "epoch": 1.423890378313971, "grad_norm": 0.6916959285736084, "learning_rate": 4.972701759529281e-05, "loss": 0.8512, "num_input_tokens_seen": 5510208, "step": 9560 }, { "epoch": 1.4246350908549301, "grad_norm": 0.6768254637718201, "learning_rate": 4.972605899349957e-05, "loss": 0.8134, "num_input_tokens_seen": 5512896, "step": 9565 }, { "epoch": 1.4253798033958893, "grad_norm": 0.5654370784759521, "learning_rate": 4.9725098720819784e-05, "loss": 0.8321, "num_input_tokens_seen": 5515808, "step": 9570 }, { "epoch": 1.4261245159368483, "grad_norm": 0.8651344776153564, "learning_rate": 4.9724136777318354e-05, "loss": 0.7794, "num_input_tokens_seen": 5518752, "step": 9575 }, { "epoch": 1.4268692284778075, "grad_norm": 0.4040229022502899, "learning_rate": 4.972317316306028e-05, "loss": 0.8092, "num_input_tokens_seen": 5521632, "step": 9580 }, { "epoch": 1.4276139410187667, "grad_norm": 0.596074104309082, "learning_rate": 4.972220787811068e-05, "loss": 0.7684, "num_input_tokens_seen": 5524704, "step": 9585 }, { "epoch": 1.428358653559726, "grad_norm": 0.5308236479759216, "learning_rate": 4.972124092253479e-05, "loss": 0.7656, "num_input_tokens_seen": 5527584, "step": 9590 }, { "epoch": 1.4291033661006851, "grad_norm": 0.46697235107421875, "learning_rate": 4.9720272296397946e-05, "loss": 0.8235, "num_input_tokens_seen": 5530240, "step": 9595 }, { "epoch": 1.4298480786416443, "grad_norm": 0.7271003127098083, "learning_rate": 4.9719301999765605e-05, "loss": 0.8057, "num_input_tokens_seen": 5532992, "step": 9600 }, { "epoch": 1.4305927911826035, "grad_norm": 0.505186140537262, "learning_rate": 4.971833003270333e-05, "loss": 0.8369, "num_input_tokens_seen": 5535712, "step": 9605 }, { "epoch": 1.4313375037235627, "grad_norm": 0.6343916058540344, "learning_rate": 4.9717356395276814e-05, "loss": 0.8338, "num_input_tokens_seen": 5539168, "step": 9610 }, { "epoch": 1.432082216264522, "grad_norm": 0.5501371622085571, "learning_rate": 4.971638108755186e-05, "loss": 0.7856, "num_input_tokens_seen": 5542112, "step": 9615 }, { "epoch": 1.4328269288054811, "grad_norm": 0.5313968062400818, "learning_rate": 4.9715404109594347e-05, "loss": 0.8059, "num_input_tokens_seen": 5545248, "step": 9620 }, { "epoch": 1.4335716413464403, "grad_norm": 0.6828755140304565, "learning_rate": 4.971442546147031e-05, "loss": 0.843, "num_input_tokens_seen": 5548128, "step": 9625 }, { "epoch": 1.4343163538873995, "grad_norm": 0.5793970227241516, "learning_rate": 4.9713445143245876e-05, "loss": 0.8144, "num_input_tokens_seen": 5550720, "step": 9630 }, { "epoch": 1.4350610664283587, "grad_norm": 0.6556704044342041, "learning_rate": 4.9712463154987305e-05, "loss": 0.8064, "num_input_tokens_seen": 5553600, "step": 9635 }, { "epoch": 1.4358057789693177, "grad_norm": 0.5219808220863342, "learning_rate": 4.9711479496760947e-05, "loss": 0.855, "num_input_tokens_seen": 5556768, "step": 9640 }, { "epoch": 1.436550491510277, "grad_norm": 0.39152392745018005, "learning_rate": 4.971049416863327e-05, "loss": 0.8068, "num_input_tokens_seen": 5559648, "step": 9645 }, { "epoch": 1.4372952040512361, "grad_norm": 0.6524953246116638, "learning_rate": 4.9709507170670866e-05, "loss": 0.7717, "num_input_tokens_seen": 5562368, "step": 9650 }, { "epoch": 1.4380399165921953, "grad_norm": 0.9702962040901184, "learning_rate": 4.970851850294043e-05, "loss": 0.8201, "num_input_tokens_seen": 5565184, "step": 9655 }, { "epoch": 1.4387846291331545, "grad_norm": 0.5720992684364319, "learning_rate": 4.970752816550877e-05, "loss": 0.7717, "num_input_tokens_seen": 5568352, "step": 9660 }, { "epoch": 1.4395293416741137, "grad_norm": 0.6544692516326904, "learning_rate": 4.970653615844281e-05, "loss": 0.8384, "num_input_tokens_seen": 5571136, "step": 9665 }, { "epoch": 1.440274054215073, "grad_norm": 0.5365238785743713, "learning_rate": 4.970554248180959e-05, "loss": 0.7785, "num_input_tokens_seen": 5573952, "step": 9670 }, { "epoch": 1.4410187667560321, "grad_norm": 0.6162155270576477, "learning_rate": 4.970454713567625e-05, "loss": 0.8425, "num_input_tokens_seen": 5577056, "step": 9675 }, { "epoch": 1.4417634792969913, "grad_norm": 0.5350973010063171, "learning_rate": 4.970355012011005e-05, "loss": 0.741, "num_input_tokens_seen": 5579520, "step": 9680 }, { "epoch": 1.4425081918379505, "grad_norm": 0.6549529433250427, "learning_rate": 4.970255143517838e-05, "loss": 0.772, "num_input_tokens_seen": 5582496, "step": 9685 }, { "epoch": 1.4432529043789097, "grad_norm": 0.4653702676296234, "learning_rate": 4.9701551080948714e-05, "loss": 0.8191, "num_input_tokens_seen": 5585280, "step": 9690 }, { "epoch": 1.443997616919869, "grad_norm": 0.4805477261543274, "learning_rate": 4.970054905748865e-05, "loss": 0.7868, "num_input_tokens_seen": 5587872, "step": 9695 }, { "epoch": 1.4447423294608281, "grad_norm": 0.6062555909156799, "learning_rate": 4.969954536486592e-05, "loss": 0.8649, "num_input_tokens_seen": 5590784, "step": 9700 }, { "epoch": 1.4454870420017873, "grad_norm": 0.6060364246368408, "learning_rate": 4.969854000314833e-05, "loss": 0.8321, "num_input_tokens_seen": 5593408, "step": 9705 }, { "epoch": 1.4462317545427466, "grad_norm": 0.529956579208374, "learning_rate": 4.9697532972403816e-05, "loss": 0.7551, "num_input_tokens_seen": 5596448, "step": 9710 }, { "epoch": 1.4469764670837058, "grad_norm": 0.49017202854156494, "learning_rate": 4.969652427270044e-05, "loss": 0.8267, "num_input_tokens_seen": 5599232, "step": 9715 }, { "epoch": 1.447721179624665, "grad_norm": 0.44634872674942017, "learning_rate": 4.969551390410636e-05, "loss": 0.7807, "num_input_tokens_seen": 5601984, "step": 9720 }, { "epoch": 1.4484658921656242, "grad_norm": 0.48802512884140015, "learning_rate": 4.969450186668986e-05, "loss": 0.7961, "num_input_tokens_seen": 5604896, "step": 9725 }, { "epoch": 1.4492106047065834, "grad_norm": 0.5168665051460266, "learning_rate": 4.969348816051932e-05, "loss": 0.8646, "num_input_tokens_seen": 5608096, "step": 9730 }, { "epoch": 1.4499553172475426, "grad_norm": 0.4275498390197754, "learning_rate": 4.9692472785663244e-05, "loss": 0.8097, "num_input_tokens_seen": 5610912, "step": 9735 }, { "epoch": 1.4507000297885018, "grad_norm": 0.43951669335365295, "learning_rate": 4.9691455742190266e-05, "loss": 0.8302, "num_input_tokens_seen": 5614080, "step": 9740 }, { "epoch": 1.4514447423294607, "grad_norm": 0.37926316261291504, "learning_rate": 4.969043703016908e-05, "loss": 0.8432, "num_input_tokens_seen": 5616704, "step": 9745 }, { "epoch": 1.45218945487042, "grad_norm": 0.5523222088813782, "learning_rate": 4.9689416649668554e-05, "loss": 0.8148, "num_input_tokens_seen": 5619648, "step": 9750 }, { "epoch": 1.4529341674113792, "grad_norm": 0.4121673107147217, "learning_rate": 4.9688394600757624e-05, "loss": 0.804, "num_input_tokens_seen": 5622432, "step": 9755 }, { "epoch": 1.4536788799523384, "grad_norm": 0.5168840289115906, "learning_rate": 4.968737088350536e-05, "loss": 0.8133, "num_input_tokens_seen": 5624992, "step": 9760 }, { "epoch": 1.4544235924932976, "grad_norm": 0.7998860478401184, "learning_rate": 4.9686345497980945e-05, "loss": 0.8413, "num_input_tokens_seen": 5627872, "step": 9765 }, { "epoch": 1.4551683050342568, "grad_norm": 0.4423958361148834, "learning_rate": 4.968531844425367e-05, "loss": 0.7781, "num_input_tokens_seen": 5630688, "step": 9770 }, { "epoch": 1.455913017575216, "grad_norm": 0.4509164094924927, "learning_rate": 4.968428972239294e-05, "loss": 0.8227, "num_input_tokens_seen": 5633568, "step": 9775 }, { "epoch": 1.4566577301161752, "grad_norm": 0.5189763903617859, "learning_rate": 4.9683259332468265e-05, "loss": 0.858, "num_input_tokens_seen": 5636288, "step": 9780 }, { "epoch": 1.4574024426571344, "grad_norm": 0.49702152609825134, "learning_rate": 4.968222727454929e-05, "loss": 0.8008, "num_input_tokens_seen": 5639200, "step": 9785 }, { "epoch": 1.4581471551980936, "grad_norm": 0.593533456325531, "learning_rate": 4.9681193548705736e-05, "loss": 0.801, "num_input_tokens_seen": 5642176, "step": 9790 }, { "epoch": 1.4588918677390528, "grad_norm": 0.630622148513794, "learning_rate": 4.9680158155007474e-05, "loss": 0.8168, "num_input_tokens_seen": 5645472, "step": 9795 }, { "epoch": 1.459636580280012, "grad_norm": 0.5333378911018372, "learning_rate": 4.967912109352446e-05, "loss": 0.7946, "num_input_tokens_seen": 5648384, "step": 9800 }, { "epoch": 1.4603812928209712, "grad_norm": 0.5695834159851074, "learning_rate": 4.9678082364326786e-05, "loss": 0.7811, "num_input_tokens_seen": 5651296, "step": 9805 }, { "epoch": 1.4611260053619302, "grad_norm": 0.5108168125152588, "learning_rate": 4.9677041967484635e-05, "loss": 0.8418, "num_input_tokens_seen": 5654080, "step": 9810 }, { "epoch": 1.4618707179028894, "grad_norm": 0.595104455947876, "learning_rate": 4.967599990306832e-05, "loss": 0.8218, "num_input_tokens_seen": 5656608, "step": 9815 }, { "epoch": 1.4626154304438486, "grad_norm": 0.5179119110107422, "learning_rate": 4.967495617114826e-05, "loss": 0.7828, "num_input_tokens_seen": 5659328, "step": 9820 }, { "epoch": 1.4633601429848078, "grad_norm": 0.5882460474967957, "learning_rate": 4.9673910771794974e-05, "loss": 0.8407, "num_input_tokens_seen": 5662240, "step": 9825 }, { "epoch": 1.464104855525767, "grad_norm": 0.4993196129798889, "learning_rate": 4.967286370507912e-05, "loss": 0.8135, "num_input_tokens_seen": 5665376, "step": 9830 }, { "epoch": 1.4648495680667262, "grad_norm": 0.5837748646736145, "learning_rate": 4.967181497107145e-05, "loss": 0.8408, "num_input_tokens_seen": 5668480, "step": 9835 }, { "epoch": 1.4655942806076854, "grad_norm": 0.7095969915390015, "learning_rate": 4.967076456984283e-05, "loss": 0.7988, "num_input_tokens_seen": 5671296, "step": 9840 }, { "epoch": 1.4663389931486446, "grad_norm": 0.6008095741271973, "learning_rate": 4.966971250146425e-05, "loss": 0.8299, "num_input_tokens_seen": 5674048, "step": 9845 }, { "epoch": 1.4670837056896038, "grad_norm": 0.5000907778739929, "learning_rate": 4.966865876600679e-05, "loss": 0.7734, "num_input_tokens_seen": 5676544, "step": 9850 }, { "epoch": 1.467828418230563, "grad_norm": 0.6371526122093201, "learning_rate": 4.9667603363541676e-05, "loss": 0.811, "num_input_tokens_seen": 5679296, "step": 9855 }, { "epoch": 1.4685731307715222, "grad_norm": 0.7009953856468201, "learning_rate": 4.9666546294140216e-05, "loss": 0.8131, "num_input_tokens_seen": 5682016, "step": 9860 }, { "epoch": 1.4693178433124814, "grad_norm": 0.5561561584472656, "learning_rate": 4.9665487557873834e-05, "loss": 0.7827, "num_input_tokens_seen": 5685376, "step": 9865 }, { "epoch": 1.4700625558534406, "grad_norm": 1.004146933555603, "learning_rate": 4.9664427154814094e-05, "loss": 0.8349, "num_input_tokens_seen": 5688256, "step": 9870 }, { "epoch": 1.4708072683943998, "grad_norm": 0.5946017503738403, "learning_rate": 4.966336508503265e-05, "loss": 0.8191, "num_input_tokens_seen": 5691392, "step": 9875 }, { "epoch": 1.471551980935359, "grad_norm": 0.5507892966270447, "learning_rate": 4.966230134860126e-05, "loss": 0.8527, "num_input_tokens_seen": 5694176, "step": 9880 }, { "epoch": 1.4722966934763182, "grad_norm": 0.5284575819969177, "learning_rate": 4.966123594559182e-05, "loss": 0.8156, "num_input_tokens_seen": 5697088, "step": 9885 }, { "epoch": 1.4730414060172774, "grad_norm": 0.4624069631099701, "learning_rate": 4.966016887607631e-05, "loss": 0.8184, "num_input_tokens_seen": 5699776, "step": 9890 }, { "epoch": 1.4737861185582366, "grad_norm": 0.5198723673820496, "learning_rate": 4.9659100140126856e-05, "loss": 0.7844, "num_input_tokens_seen": 5702912, "step": 9895 }, { "epoch": 1.4745308310991958, "grad_norm": 0.42807748913764954, "learning_rate": 4.965802973781567e-05, "loss": 0.795, "num_input_tokens_seen": 5705664, "step": 9900 }, { "epoch": 1.475275543640155, "grad_norm": 0.6820382475852966, "learning_rate": 4.965695766921509e-05, "loss": 0.8252, "num_input_tokens_seen": 5708512, "step": 9905 }, { "epoch": 1.4760202561811142, "grad_norm": 0.6322929859161377, "learning_rate": 4.965588393439755e-05, "loss": 0.8419, "num_input_tokens_seen": 5711360, "step": 9910 }, { "epoch": 1.4767649687220734, "grad_norm": 0.8310784697532654, "learning_rate": 4.965480853343563e-05, "loss": 0.8108, "num_input_tokens_seen": 5713792, "step": 9915 }, { "epoch": 1.4775096812630324, "grad_norm": 0.45459476113319397, "learning_rate": 4.9653731466401975e-05, "loss": 0.7948, "num_input_tokens_seen": 5716256, "step": 9920 }, { "epoch": 1.4782543938039916, "grad_norm": 0.5131661295890808, "learning_rate": 4.965265273336939e-05, "loss": 0.7991, "num_input_tokens_seen": 5718912, "step": 9925 }, { "epoch": 1.4789991063449508, "grad_norm": 0.580951988697052, "learning_rate": 4.9651572334410757e-05, "loss": 0.8083, "num_input_tokens_seen": 5721632, "step": 9930 }, { "epoch": 1.47974381888591, "grad_norm": 0.6061341762542725, "learning_rate": 4.9650490269599096e-05, "loss": 0.8121, "num_input_tokens_seen": 5724544, "step": 9935 }, { "epoch": 1.4804885314268692, "grad_norm": 0.47675982117652893, "learning_rate": 4.964940653900753e-05, "loss": 0.8041, "num_input_tokens_seen": 5727328, "step": 9940 }, { "epoch": 1.4812332439678284, "grad_norm": 0.5914363861083984, "learning_rate": 4.964832114270928e-05, "loss": 0.7861, "num_input_tokens_seen": 5730144, "step": 9945 }, { "epoch": 1.4819779565087876, "grad_norm": 0.5943604111671448, "learning_rate": 4.96472340807777e-05, "loss": 0.8165, "num_input_tokens_seen": 5732864, "step": 9950 }, { "epoch": 1.4827226690497468, "grad_norm": 0.5324876308441162, "learning_rate": 4.964614535328626e-05, "loss": 0.8695, "num_input_tokens_seen": 5735904, "step": 9955 }, { "epoch": 1.483467381590706, "grad_norm": 0.6051291227340698, "learning_rate": 4.9645054960308504e-05, "loss": 0.8406, "num_input_tokens_seen": 5738752, "step": 9960 }, { "epoch": 1.4842120941316652, "grad_norm": 0.47845086455345154, "learning_rate": 4.964396290191814e-05, "loss": 0.8083, "num_input_tokens_seen": 5741504, "step": 9965 }, { "epoch": 1.4849568066726244, "grad_norm": 0.6315048933029175, "learning_rate": 4.964286917818895e-05, "loss": 0.8024, "num_input_tokens_seen": 5743968, "step": 9970 }, { "epoch": 1.4857015192135836, "grad_norm": 0.43673640489578247, "learning_rate": 4.964177378919487e-05, "loss": 0.8032, "num_input_tokens_seen": 5746912, "step": 9975 }, { "epoch": 1.4864462317545428, "grad_norm": 0.5174721479415894, "learning_rate": 4.9640676735009894e-05, "loss": 0.8269, "num_input_tokens_seen": 5749536, "step": 9980 }, { "epoch": 1.4871909442955018, "grad_norm": 0.41999098658561707, "learning_rate": 4.963957801570816e-05, "loss": 0.7475, "num_input_tokens_seen": 5752096, "step": 9985 }, { "epoch": 1.487935656836461, "grad_norm": 0.5415849089622498, "learning_rate": 4.963847763136393e-05, "loss": 0.8177, "num_input_tokens_seen": 5755040, "step": 9990 }, { "epoch": 1.4886803693774202, "grad_norm": 0.4758392870426178, "learning_rate": 4.9637375582051556e-05, "loss": 0.8289, "num_input_tokens_seen": 5758016, "step": 9995 }, { "epoch": 1.4894250819183794, "grad_norm": 0.540533185005188, "learning_rate": 4.96362718678455e-05, "loss": 0.8088, "num_input_tokens_seen": 5760832, "step": 10000 }, { "epoch": 1.4901697944593386, "grad_norm": 0.46679094433784485, "learning_rate": 4.9635166488820365e-05, "loss": 0.8097, "num_input_tokens_seen": 5763584, "step": 10005 }, { "epoch": 1.4909145070002978, "grad_norm": 0.41907140612602234, "learning_rate": 4.963405944505083e-05, "loss": 0.8047, "num_input_tokens_seen": 5766400, "step": 10010 }, { "epoch": 1.491659219541257, "grad_norm": 0.6174973845481873, "learning_rate": 4.9632950736611713e-05, "loss": 0.7778, "num_input_tokens_seen": 5769664, "step": 10015 }, { "epoch": 1.4924039320822162, "grad_norm": 0.45973891019821167, "learning_rate": 4.963184036357793e-05, "loss": 0.7952, "num_input_tokens_seen": 5772288, "step": 10020 }, { "epoch": 1.4931486446231754, "grad_norm": 0.4984581768512726, "learning_rate": 4.9630728326024535e-05, "loss": 0.8205, "num_input_tokens_seen": 5775232, "step": 10025 }, { "epoch": 1.4938933571641346, "grad_norm": 0.484140008687973, "learning_rate": 4.962961462402666e-05, "loss": 0.8446, "num_input_tokens_seen": 5777856, "step": 10030 }, { "epoch": 1.4946380697050938, "grad_norm": 0.5734777450561523, "learning_rate": 4.9628499257659553e-05, "loss": 0.8158, "num_input_tokens_seen": 5780960, "step": 10035 }, { "epoch": 1.495382782246053, "grad_norm": 0.45246613025665283, "learning_rate": 4.9627382226998605e-05, "loss": 0.8528, "num_input_tokens_seen": 5783936, "step": 10040 }, { "epoch": 1.4961274947870122, "grad_norm": 0.4359065592288971, "learning_rate": 4.9626263532119286e-05, "loss": 0.766, "num_input_tokens_seen": 5786720, "step": 10045 }, { "epoch": 1.4968722073279714, "grad_norm": 0.41728585958480835, "learning_rate": 4.962514317309721e-05, "loss": 0.7921, "num_input_tokens_seen": 5789472, "step": 10050 }, { "epoch": 1.4976169198689306, "grad_norm": 0.5060484409332275, "learning_rate": 4.962402115000808e-05, "loss": 0.7865, "num_input_tokens_seen": 5792192, "step": 10055 }, { "epoch": 1.4983616324098898, "grad_norm": 0.5244513750076294, "learning_rate": 4.962289746292771e-05, "loss": 0.7882, "num_input_tokens_seen": 5795264, "step": 10060 }, { "epoch": 1.499106344950849, "grad_norm": 0.6245842576026917, "learning_rate": 4.962177211193203e-05, "loss": 0.8943, "num_input_tokens_seen": 5798112, "step": 10065 }, { "epoch": 1.4998510574918082, "grad_norm": 0.44144967198371887, "learning_rate": 4.962064509709711e-05, "loss": 0.8288, "num_input_tokens_seen": 5801088, "step": 10070 }, { "epoch": 1.5, "eval_loss": 0.8164980411529541, "eval_runtime": 45.4461, "eval_samples_per_second": 65.66, "eval_steps_per_second": 16.415, "num_input_tokens_seen": 5801696, "step": 10071 }, { "epoch": 1.5005957700327675, "grad_norm": 0.6380800604820251, "learning_rate": 4.961951641849909e-05, "loss": 0.8238, "num_input_tokens_seen": 5804000, "step": 10075 }, { "epoch": 1.5013404825737267, "grad_norm": 0.48361021280288696, "learning_rate": 4.961838607621424e-05, "loss": 0.8286, "num_input_tokens_seen": 5807488, "step": 10080 }, { "epoch": 1.5020851951146859, "grad_norm": 0.47738075256347656, "learning_rate": 4.961725407031896e-05, "loss": 0.7948, "num_input_tokens_seen": 5810016, "step": 10085 }, { "epoch": 1.502829907655645, "grad_norm": 0.4242149293422699, "learning_rate": 4.961612040088973e-05, "loss": 0.8124, "num_input_tokens_seen": 5812800, "step": 10090 }, { "epoch": 1.5035746201966043, "grad_norm": 0.541104793548584, "learning_rate": 4.9614985068003163e-05, "loss": 0.7961, "num_input_tokens_seen": 5815552, "step": 10095 }, { "epoch": 1.5043193327375635, "grad_norm": 0.4758601486682892, "learning_rate": 4.9613848071735987e-05, "loss": 0.7703, "num_input_tokens_seen": 5818656, "step": 10100 }, { "epoch": 1.5050640452785224, "grad_norm": 0.48505568504333496, "learning_rate": 4.9612709412165024e-05, "loss": 0.8656, "num_input_tokens_seen": 5821600, "step": 10105 }, { "epoch": 1.5058087578194816, "grad_norm": 0.6085879802703857, "learning_rate": 4.961156908936724e-05, "loss": 0.8304, "num_input_tokens_seen": 5824448, "step": 10110 }, { "epoch": 1.5065534703604408, "grad_norm": 0.4297896921634674, "learning_rate": 4.961042710341967e-05, "loss": 0.823, "num_input_tokens_seen": 5827328, "step": 10115 }, { "epoch": 1.5072981829014, "grad_norm": 0.41708049178123474, "learning_rate": 4.96092834543995e-05, "loss": 0.7707, "num_input_tokens_seen": 5830304, "step": 10120 }, { "epoch": 1.5080428954423593, "grad_norm": 0.46726465225219727, "learning_rate": 4.9608138142384e-05, "loss": 0.8275, "num_input_tokens_seen": 5833120, "step": 10125 }, { "epoch": 1.5087876079833185, "grad_norm": 0.4709087312221527, "learning_rate": 4.9606991167450584e-05, "loss": 0.7948, "num_input_tokens_seen": 5835840, "step": 10130 }, { "epoch": 1.5095323205242777, "grad_norm": 0.45057228207588196, "learning_rate": 4.9605842529676746e-05, "loss": 0.8241, "num_input_tokens_seen": 5838752, "step": 10135 }, { "epoch": 1.5102770330652369, "grad_norm": 0.5572271347045898, "learning_rate": 4.9604692229140106e-05, "loss": 0.791, "num_input_tokens_seen": 5841792, "step": 10140 }, { "epoch": 1.5110217456061958, "grad_norm": 0.3724130690097809, "learning_rate": 4.96035402659184e-05, "loss": 0.8408, "num_input_tokens_seen": 5844672, "step": 10145 }, { "epoch": 1.511766458147155, "grad_norm": 0.46286800503730774, "learning_rate": 4.960238664008948e-05, "loss": 0.7858, "num_input_tokens_seen": 5847552, "step": 10150 }, { "epoch": 1.5125111706881142, "grad_norm": 0.34159529209136963, "learning_rate": 4.960123135173129e-05, "loss": 0.7778, "num_input_tokens_seen": 5850688, "step": 10155 }, { "epoch": 1.5132558832290735, "grad_norm": 0.4694283902645111, "learning_rate": 4.960007440092191e-05, "loss": 0.7758, "num_input_tokens_seen": 5853536, "step": 10160 }, { "epoch": 1.5140005957700327, "grad_norm": 0.6982734203338623, "learning_rate": 4.959891578773953e-05, "loss": 0.8848, "num_input_tokens_seen": 5856480, "step": 10165 }, { "epoch": 1.5147453083109919, "grad_norm": 0.4910726845264435, "learning_rate": 4.959775551226242e-05, "loss": 0.7529, "num_input_tokens_seen": 5859264, "step": 10170 }, { "epoch": 1.515490020851951, "grad_norm": 0.48424527049064636, "learning_rate": 4.9596593574569e-05, "loss": 0.8037, "num_input_tokens_seen": 5862112, "step": 10175 }, { "epoch": 1.5162347333929103, "grad_norm": 0.4841332733631134, "learning_rate": 4.9595429974737796e-05, "loss": 0.7795, "num_input_tokens_seen": 5865216, "step": 10180 }, { "epoch": 1.5169794459338695, "grad_norm": 0.39408519864082336, "learning_rate": 4.959426471284742e-05, "loss": 0.8597, "num_input_tokens_seen": 5867872, "step": 10185 }, { "epoch": 1.5177241584748287, "grad_norm": 0.4051976799964905, "learning_rate": 4.959309778897664e-05, "loss": 0.7742, "num_input_tokens_seen": 5870880, "step": 10190 }, { "epoch": 1.5184688710157879, "grad_norm": 0.4633775055408478, "learning_rate": 4.959192920320429e-05, "loss": 0.7781, "num_input_tokens_seen": 5873920, "step": 10195 }, { "epoch": 1.519213583556747, "grad_norm": 0.6017754673957825, "learning_rate": 4.959075895560935e-05, "loss": 0.8015, "num_input_tokens_seen": 5876928, "step": 10200 }, { "epoch": 1.5199582960977063, "grad_norm": 0.48910221457481384, "learning_rate": 4.9589587046270904e-05, "loss": 0.7978, "num_input_tokens_seen": 5879936, "step": 10205 }, { "epoch": 1.5207030086386655, "grad_norm": 0.5895496606826782, "learning_rate": 4.958841347526814e-05, "loss": 0.8558, "num_input_tokens_seen": 5883072, "step": 10210 }, { "epoch": 1.5214477211796247, "grad_norm": 0.6782458424568176, "learning_rate": 4.9587238242680356e-05, "loss": 0.8221, "num_input_tokens_seen": 5885984, "step": 10215 }, { "epoch": 1.5221924337205839, "grad_norm": 0.42867133021354675, "learning_rate": 4.958606134858697e-05, "loss": 0.8056, "num_input_tokens_seen": 5888704, "step": 10220 }, { "epoch": 1.522937146261543, "grad_norm": 0.4508671462535858, "learning_rate": 4.9584882793067534e-05, "loss": 0.8117, "num_input_tokens_seen": 5891488, "step": 10225 }, { "epoch": 1.5236818588025023, "grad_norm": 0.3717007040977478, "learning_rate": 4.958370257620166e-05, "loss": 0.8076, "num_input_tokens_seen": 5894176, "step": 10230 }, { "epoch": 1.5244265713434615, "grad_norm": 0.7464588284492493, "learning_rate": 4.958252069806912e-05, "loss": 0.8194, "num_input_tokens_seen": 5897088, "step": 10235 }, { "epoch": 1.5251712838844207, "grad_norm": 0.470481812953949, "learning_rate": 4.9581337158749784e-05, "loss": 0.8166, "num_input_tokens_seen": 5899840, "step": 10240 }, { "epoch": 1.52591599642538, "grad_norm": 0.6940283179283142, "learning_rate": 4.958015195832362e-05, "loss": 0.7908, "num_input_tokens_seen": 5902816, "step": 10245 }, { "epoch": 1.526660708966339, "grad_norm": 0.4420122802257538, "learning_rate": 4.957896509687072e-05, "loss": 0.7711, "num_input_tokens_seen": 5905792, "step": 10250 }, { "epoch": 1.5274054215072983, "grad_norm": 0.48810097575187683, "learning_rate": 4.957777657447128e-05, "loss": 0.7747, "num_input_tokens_seen": 5909152, "step": 10255 }, { "epoch": 1.5281501340482575, "grad_norm": 0.3733593225479126, "learning_rate": 4.957658639120564e-05, "loss": 0.8063, "num_input_tokens_seen": 5912352, "step": 10260 }, { "epoch": 1.5288948465892167, "grad_norm": 0.5610625147819519, "learning_rate": 4.957539454715421e-05, "loss": 0.8306, "num_input_tokens_seen": 5915264, "step": 10265 }, { "epoch": 1.529639559130176, "grad_norm": 0.4969543516635895, "learning_rate": 4.957420104239753e-05, "loss": 0.8559, "num_input_tokens_seen": 5918144, "step": 10270 }, { "epoch": 1.5303842716711349, "grad_norm": 0.5715532898902893, "learning_rate": 4.9573005877016255e-05, "loss": 0.8065, "num_input_tokens_seen": 5920896, "step": 10275 }, { "epoch": 1.531128984212094, "grad_norm": 0.4943716526031494, "learning_rate": 4.957180905109115e-05, "loss": 0.8154, "num_input_tokens_seen": 5923904, "step": 10280 }, { "epoch": 1.5318736967530533, "grad_norm": 0.5711119771003723, "learning_rate": 4.9570610564703086e-05, "loss": 0.8206, "num_input_tokens_seen": 5926688, "step": 10285 }, { "epoch": 1.5326184092940125, "grad_norm": 0.7700261473655701, "learning_rate": 4.956941041793306e-05, "loss": 0.7786, "num_input_tokens_seen": 5929536, "step": 10290 }, { "epoch": 1.5333631218349717, "grad_norm": 0.47454917430877686, "learning_rate": 4.956820861086217e-05, "loss": 0.8048, "num_input_tokens_seen": 5932352, "step": 10295 }, { "epoch": 1.534107834375931, "grad_norm": 0.5503412485122681, "learning_rate": 4.956700514357163e-05, "loss": 0.8273, "num_input_tokens_seen": 5935136, "step": 10300 }, { "epoch": 1.53485254691689, "grad_norm": 0.4382669925689697, "learning_rate": 4.956580001614277e-05, "loss": 0.8032, "num_input_tokens_seen": 5938048, "step": 10305 }, { "epoch": 1.5355972594578493, "grad_norm": 0.4741092026233673, "learning_rate": 4.9564593228657016e-05, "loss": 0.8319, "num_input_tokens_seen": 5940768, "step": 10310 }, { "epoch": 1.5363419719988085, "grad_norm": 0.4472988545894623, "learning_rate": 4.956338478119592e-05, "loss": 0.8155, "num_input_tokens_seen": 5943552, "step": 10315 }, { "epoch": 1.5370866845397675, "grad_norm": 0.39492106437683105, "learning_rate": 4.956217467384116e-05, "loss": 0.7808, "num_input_tokens_seen": 5946624, "step": 10320 }, { "epoch": 1.5378313970807267, "grad_norm": 0.4421440362930298, "learning_rate": 4.9560962906674493e-05, "loss": 0.8038, "num_input_tokens_seen": 5949440, "step": 10325 }, { "epoch": 1.538576109621686, "grad_norm": 0.6412917375564575, "learning_rate": 4.9559749479777805e-05, "loss": 0.8147, "num_input_tokens_seen": 5952576, "step": 10330 }, { "epoch": 1.539320822162645, "grad_norm": 0.4444758892059326, "learning_rate": 4.9558534393233104e-05, "loss": 0.7917, "num_input_tokens_seen": 5955680, "step": 10335 }, { "epoch": 1.5400655347036043, "grad_norm": 0.4505847096443176, "learning_rate": 4.955731764712249e-05, "loss": 0.7939, "num_input_tokens_seen": 5958368, "step": 10340 }, { "epoch": 1.5408102472445635, "grad_norm": 0.5785138607025146, "learning_rate": 4.9556099241528194e-05, "loss": 0.8398, "num_input_tokens_seen": 5961120, "step": 10345 }, { "epoch": 1.5415549597855227, "grad_norm": 0.5907132625579834, "learning_rate": 4.955487917653256e-05, "loss": 0.7868, "num_input_tokens_seen": 5964288, "step": 10350 }, { "epoch": 1.542299672326482, "grad_norm": 0.4579623341560364, "learning_rate": 4.955365745221802e-05, "loss": 0.8212, "num_input_tokens_seen": 5966944, "step": 10355 }, { "epoch": 1.543044384867441, "grad_norm": 0.5684084892272949, "learning_rate": 4.955243406866713e-05, "loss": 0.7792, "num_input_tokens_seen": 5969792, "step": 10360 }, { "epoch": 1.5437890974084003, "grad_norm": 0.5288793444633484, "learning_rate": 4.9551209025962575e-05, "loss": 0.8447, "num_input_tokens_seen": 5972800, "step": 10365 }, { "epoch": 1.5445338099493595, "grad_norm": 0.49152398109436035, "learning_rate": 4.9549982324187125e-05, "loss": 0.8101, "num_input_tokens_seen": 5975616, "step": 10370 }, { "epoch": 1.5452785224903187, "grad_norm": 0.5486689209938049, "learning_rate": 4.954875396342369e-05, "loss": 0.7919, "num_input_tokens_seen": 5978368, "step": 10375 }, { "epoch": 1.546023235031278, "grad_norm": 0.38043516874313354, "learning_rate": 4.954752394375527e-05, "loss": 0.7934, "num_input_tokens_seen": 5981152, "step": 10380 }, { "epoch": 1.5467679475722371, "grad_norm": 0.554171621799469, "learning_rate": 4.9546292265264985e-05, "loss": 0.8355, "num_input_tokens_seen": 5984224, "step": 10385 }, { "epoch": 1.5475126601131963, "grad_norm": 0.461764395236969, "learning_rate": 4.9545058928036056e-05, "loss": 0.8542, "num_input_tokens_seen": 5987072, "step": 10390 }, { "epoch": 1.5482573726541555, "grad_norm": 0.5249062180519104, "learning_rate": 4.9543823932151845e-05, "loss": 0.8232, "num_input_tokens_seen": 5990048, "step": 10395 }, { "epoch": 1.5490020851951147, "grad_norm": 0.4738244116306305, "learning_rate": 4.954258727769581e-05, "loss": 0.8286, "num_input_tokens_seen": 5993024, "step": 10400 }, { "epoch": 1.549746797736074, "grad_norm": 0.4624103903770447, "learning_rate": 4.9541348964751497e-05, "loss": 0.8598, "num_input_tokens_seen": 5995968, "step": 10405 }, { "epoch": 1.5504915102770331, "grad_norm": 0.467610627412796, "learning_rate": 4.95401089934026e-05, "loss": 0.7798, "num_input_tokens_seen": 5998848, "step": 10410 }, { "epoch": 1.5512362228179923, "grad_norm": 0.3935079574584961, "learning_rate": 4.953886736373291e-05, "loss": 0.7696, "num_input_tokens_seen": 6001760, "step": 10415 }, { "epoch": 1.5519809353589515, "grad_norm": 0.5149455070495605, "learning_rate": 4.953762407582634e-05, "loss": 0.8618, "num_input_tokens_seen": 6004768, "step": 10420 }, { "epoch": 1.5527256478999107, "grad_norm": 0.47537359595298767, "learning_rate": 4.953637912976688e-05, "loss": 0.7882, "num_input_tokens_seen": 6007552, "step": 10425 }, { "epoch": 1.55347036044087, "grad_norm": 0.8231884837150574, "learning_rate": 4.9535132525638696e-05, "loss": 0.7725, "num_input_tokens_seen": 6010464, "step": 10430 }, { "epoch": 1.5542150729818291, "grad_norm": 0.49431338906288147, "learning_rate": 4.9533884263526e-05, "loss": 0.7948, "num_input_tokens_seen": 6013792, "step": 10435 }, { "epoch": 1.5549597855227884, "grad_norm": 0.4601791501045227, "learning_rate": 4.953263434351315e-05, "loss": 0.8538, "num_input_tokens_seen": 6016480, "step": 10440 }, { "epoch": 1.5557044980637476, "grad_norm": 0.3429465591907501, "learning_rate": 4.953138276568462e-05, "loss": 0.8464, "num_input_tokens_seen": 6019072, "step": 10445 }, { "epoch": 1.5564492106047065, "grad_norm": 0.6047995686531067, "learning_rate": 4.953012953012498e-05, "loss": 0.8328, "num_input_tokens_seen": 6022048, "step": 10450 }, { "epoch": 1.5571939231456657, "grad_norm": 0.42163771390914917, "learning_rate": 4.952887463691891e-05, "loss": 0.8461, "num_input_tokens_seen": 6024896, "step": 10455 }, { "epoch": 1.557938635686625, "grad_norm": 0.46793973445892334, "learning_rate": 4.9527618086151226e-05, "loss": 0.8166, "num_input_tokens_seen": 6027744, "step": 10460 }, { "epoch": 1.5586833482275841, "grad_norm": 0.4094576835632324, "learning_rate": 4.952635987790683e-05, "loss": 0.8233, "num_input_tokens_seen": 6030464, "step": 10465 }, { "epoch": 1.5594280607685433, "grad_norm": 0.5451374650001526, "learning_rate": 4.9525100012270754e-05, "loss": 0.791, "num_input_tokens_seen": 6033344, "step": 10470 }, { "epoch": 1.5601727733095025, "grad_norm": 0.7666795253753662, "learning_rate": 4.9523838489328134e-05, "loss": 0.8059, "num_input_tokens_seen": 6036256, "step": 10475 }, { "epoch": 1.5609174858504618, "grad_norm": 0.6071085929870605, "learning_rate": 4.952257530916421e-05, "loss": 0.8245, "num_input_tokens_seen": 6039008, "step": 10480 }, { "epoch": 1.561662198391421, "grad_norm": 0.4566490948200226, "learning_rate": 4.9521310471864346e-05, "loss": 0.8083, "num_input_tokens_seen": 6041792, "step": 10485 }, { "epoch": 1.5624069109323802, "grad_norm": 0.29258856177330017, "learning_rate": 4.952004397751402e-05, "loss": 0.7864, "num_input_tokens_seen": 6044768, "step": 10490 }, { "epoch": 1.5631516234733391, "grad_norm": 0.5019757151603699, "learning_rate": 4.951877582619881e-05, "loss": 0.8015, "num_input_tokens_seen": 6047808, "step": 10495 }, { "epoch": 1.5638963360142983, "grad_norm": 0.3188088536262512, "learning_rate": 4.951750601800442e-05, "loss": 0.7943, "num_input_tokens_seen": 6050720, "step": 10500 }, { "epoch": 1.5646410485552575, "grad_norm": 0.4711715281009674, "learning_rate": 4.9516234553016656e-05, "loss": 0.8535, "num_input_tokens_seen": 6053760, "step": 10505 }, { "epoch": 1.5653857610962167, "grad_norm": 0.7672199010848999, "learning_rate": 4.951496143132143e-05, "loss": 0.828, "num_input_tokens_seen": 6056704, "step": 10510 }, { "epoch": 1.566130473637176, "grad_norm": 0.42728903889656067, "learning_rate": 4.9513686653004785e-05, "loss": 0.8021, "num_input_tokens_seen": 6059424, "step": 10515 }, { "epoch": 1.5668751861781351, "grad_norm": 0.5273323655128479, "learning_rate": 4.951241021815286e-05, "loss": 0.8188, "num_input_tokens_seen": 6062144, "step": 10520 }, { "epoch": 1.5676198987190944, "grad_norm": 0.4848068952560425, "learning_rate": 4.9511132126851914e-05, "loss": 0.8241, "num_input_tokens_seen": 6065248, "step": 10525 }, { "epoch": 1.5683646112600536, "grad_norm": 0.6115229725837708, "learning_rate": 4.950985237918831e-05, "loss": 0.7991, "num_input_tokens_seen": 6068128, "step": 10530 }, { "epoch": 1.5691093238010128, "grad_norm": 0.5478389263153076, "learning_rate": 4.950857097524854e-05, "loss": 0.7945, "num_input_tokens_seen": 6070848, "step": 10535 }, { "epoch": 1.569854036341972, "grad_norm": 0.4364960193634033, "learning_rate": 4.950728791511918e-05, "loss": 0.7886, "num_input_tokens_seen": 6073952, "step": 10540 }, { "epoch": 1.5705987488829312, "grad_norm": 0.45773959159851074, "learning_rate": 4.950600319888695e-05, "loss": 0.8016, "num_input_tokens_seen": 6076960, "step": 10545 }, { "epoch": 1.5713434614238904, "grad_norm": 0.5760870575904846, "learning_rate": 4.9504716826638655e-05, "loss": 0.7928, "num_input_tokens_seen": 6079712, "step": 10550 }, { "epoch": 1.5720881739648496, "grad_norm": 0.6531453132629395, "learning_rate": 4.9503428798461226e-05, "loss": 0.7695, "num_input_tokens_seen": 6082272, "step": 10555 }, { "epoch": 1.5728328865058088, "grad_norm": 0.30610567331314087, "learning_rate": 4.95021391144417e-05, "loss": 0.8091, "num_input_tokens_seen": 6084960, "step": 10560 }, { "epoch": 1.573577599046768, "grad_norm": 0.45590758323669434, "learning_rate": 4.950084777466724e-05, "loss": 0.8319, "num_input_tokens_seen": 6087808, "step": 10565 }, { "epoch": 1.5743223115877272, "grad_norm": 0.44087544083595276, "learning_rate": 4.949955477922509e-05, "loss": 0.7652, "num_input_tokens_seen": 6090496, "step": 10570 }, { "epoch": 1.5750670241286864, "grad_norm": 0.5528504252433777, "learning_rate": 4.9498260128202635e-05, "loss": 0.7833, "num_input_tokens_seen": 6093600, "step": 10575 }, { "epoch": 1.5758117366696456, "grad_norm": 0.39693230390548706, "learning_rate": 4.949696382168737e-05, "loss": 0.802, "num_input_tokens_seen": 6096352, "step": 10580 }, { "epoch": 1.5765564492106048, "grad_norm": 0.5828707218170166, "learning_rate": 4.949566585976688e-05, "loss": 0.8024, "num_input_tokens_seen": 6098976, "step": 10585 }, { "epoch": 1.577301161751564, "grad_norm": 0.5023820400238037, "learning_rate": 4.949436624252889e-05, "loss": 0.7761, "num_input_tokens_seen": 6101600, "step": 10590 }, { "epoch": 1.5780458742925232, "grad_norm": 0.46682947874069214, "learning_rate": 4.949306497006121e-05, "loss": 0.8399, "num_input_tokens_seen": 6104256, "step": 10595 }, { "epoch": 1.5787905868334824, "grad_norm": 0.6322054862976074, "learning_rate": 4.949176204245178e-05, "loss": 0.807, "num_input_tokens_seen": 6107040, "step": 10600 }, { "epoch": 1.5795352993744416, "grad_norm": 0.6273335218429565, "learning_rate": 4.949045745978866e-05, "loss": 0.7951, "num_input_tokens_seen": 6109984, "step": 10605 }, { "epoch": 1.5802800119154008, "grad_norm": 0.5090467929840088, "learning_rate": 4.9489151222159984e-05, "loss": 0.7722, "num_input_tokens_seen": 6112960, "step": 10610 }, { "epoch": 1.58102472445636, "grad_norm": 0.4401690661907196, "learning_rate": 4.948784332965404e-05, "loss": 0.8109, "num_input_tokens_seen": 6115840, "step": 10615 }, { "epoch": 1.5817694369973192, "grad_norm": 0.6280370354652405, "learning_rate": 4.94865337823592e-05, "loss": 0.8561, "num_input_tokens_seen": 6118848, "step": 10620 }, { "epoch": 1.5825141495382782, "grad_norm": 0.4338701069355011, "learning_rate": 4.948522258036397e-05, "loss": 0.8383, "num_input_tokens_seen": 6121760, "step": 10625 }, { "epoch": 1.5832588620792374, "grad_norm": 0.5355629324913025, "learning_rate": 4.948390972375694e-05, "loss": 0.8274, "num_input_tokens_seen": 6124672, "step": 10630 }, { "epoch": 1.5840035746201966, "grad_norm": 0.5509082674980164, "learning_rate": 4.948259521262684e-05, "loss": 0.8578, "num_input_tokens_seen": 6127392, "step": 10635 }, { "epoch": 1.5847482871611558, "grad_norm": 0.5221002101898193, "learning_rate": 4.948127904706249e-05, "loss": 0.7825, "num_input_tokens_seen": 6130368, "step": 10640 }, { "epoch": 1.585492999702115, "grad_norm": 0.41906097531318665, "learning_rate": 4.947996122715283e-05, "loss": 0.7986, "num_input_tokens_seen": 6133216, "step": 10645 }, { "epoch": 1.5862377122430742, "grad_norm": 0.5795125365257263, "learning_rate": 4.947864175298693e-05, "loss": 0.8316, "num_input_tokens_seen": 6136224, "step": 10650 }, { "epoch": 1.5869824247840334, "grad_norm": 0.5099833011627197, "learning_rate": 4.9477320624653937e-05, "loss": 0.8433, "num_input_tokens_seen": 6139168, "step": 10655 }, { "epoch": 1.5877271373249926, "grad_norm": 0.4952910840511322, "learning_rate": 4.9475997842243136e-05, "loss": 0.8189, "num_input_tokens_seen": 6142144, "step": 10660 }, { "epoch": 1.5884718498659516, "grad_norm": 0.47893020510673523, "learning_rate": 4.947467340584391e-05, "loss": 0.8072, "num_input_tokens_seen": 6145056, "step": 10665 }, { "epoch": 1.5892165624069108, "grad_norm": 0.47621336579322815, "learning_rate": 4.947334731554577e-05, "loss": 0.7956, "num_input_tokens_seen": 6147776, "step": 10670 }, { "epoch": 1.58996127494787, "grad_norm": 0.5611395239830017, "learning_rate": 4.947201957143831e-05, "loss": 0.7965, "num_input_tokens_seen": 6150752, "step": 10675 }, { "epoch": 1.5907059874888292, "grad_norm": 0.44887784123420715, "learning_rate": 4.947069017361127e-05, "loss": 0.7915, "num_input_tokens_seen": 6153504, "step": 10680 }, { "epoch": 1.5914507000297884, "grad_norm": 0.4248636066913605, "learning_rate": 4.9469359122154476e-05, "loss": 0.8211, "num_input_tokens_seen": 6156576, "step": 10685 }, { "epoch": 1.5921954125707476, "grad_norm": 0.7302814722061157, "learning_rate": 4.946802641715788e-05, "loss": 0.8259, "num_input_tokens_seen": 6159232, "step": 10690 }, { "epoch": 1.5929401251117068, "grad_norm": 0.5640091300010681, "learning_rate": 4.9466692058711536e-05, "loss": 0.8523, "num_input_tokens_seen": 6162400, "step": 10695 }, { "epoch": 1.593684837652666, "grad_norm": 0.5125372409820557, "learning_rate": 4.946535604690562e-05, "loss": 0.8126, "num_input_tokens_seen": 6165536, "step": 10700 }, { "epoch": 1.5944295501936252, "grad_norm": 0.5747396945953369, "learning_rate": 4.94640183818304e-05, "loss": 0.8102, "num_input_tokens_seen": 6168352, "step": 10705 }, { "epoch": 1.5951742627345844, "grad_norm": 0.42025884985923767, "learning_rate": 4.9462679063576286e-05, "loss": 0.7953, "num_input_tokens_seen": 6171232, "step": 10710 }, { "epoch": 1.5959189752755436, "grad_norm": 0.6316786408424377, "learning_rate": 4.946133809223379e-05, "loss": 0.831, "num_input_tokens_seen": 6174176, "step": 10715 }, { "epoch": 1.5966636878165028, "grad_norm": 0.4738353192806244, "learning_rate": 4.945999546789351e-05, "loss": 0.8151, "num_input_tokens_seen": 6177728, "step": 10720 }, { "epoch": 1.597408400357462, "grad_norm": 0.4475695788860321, "learning_rate": 4.9458651190646185e-05, "loss": 0.789, "num_input_tokens_seen": 6180576, "step": 10725 }, { "epoch": 1.5981531128984212, "grad_norm": 0.4643925428390503, "learning_rate": 4.945730526058265e-05, "loss": 0.8057, "num_input_tokens_seen": 6183776, "step": 10730 }, { "epoch": 1.5988978254393804, "grad_norm": 0.5685144066810608, "learning_rate": 4.9455957677793865e-05, "loss": 0.8106, "num_input_tokens_seen": 6186368, "step": 10735 }, { "epoch": 1.5996425379803396, "grad_norm": 0.4255041480064392, "learning_rate": 4.94546084423709e-05, "loss": 0.8157, "num_input_tokens_seen": 6189120, "step": 10740 }, { "epoch": 1.6003872505212988, "grad_norm": 0.3491296172142029, "learning_rate": 4.945325755440491e-05, "loss": 0.8408, "num_input_tokens_seen": 6191872, "step": 10745 }, { "epoch": 1.601131963062258, "grad_norm": 0.5547944903373718, "learning_rate": 4.945190501398719e-05, "loss": 0.8084, "num_input_tokens_seen": 6194816, "step": 10750 }, { "epoch": 1.6018766756032172, "grad_norm": 0.47252410650253296, "learning_rate": 4.945055082120915e-05, "loss": 0.8283, "num_input_tokens_seen": 6197664, "step": 10755 }, { "epoch": 1.6026213881441764, "grad_norm": 0.4868457615375519, "learning_rate": 4.94491949761623e-05, "loss": 0.8433, "num_input_tokens_seen": 6200640, "step": 10760 }, { "epoch": 1.6033661006851356, "grad_norm": 0.4026935398578644, "learning_rate": 4.944783747893825e-05, "loss": 0.8287, "num_input_tokens_seen": 6203552, "step": 10765 }, { "epoch": 1.6041108132260948, "grad_norm": 0.4029526710510254, "learning_rate": 4.944647832962874e-05, "loss": 0.8205, "num_input_tokens_seen": 6206336, "step": 10770 }, { "epoch": 1.604855525767054, "grad_norm": 0.43473875522613525, "learning_rate": 4.9445117528325625e-05, "loss": 0.8405, "num_input_tokens_seen": 6209216, "step": 10775 }, { "epoch": 1.6056002383080132, "grad_norm": 0.4795618951320648, "learning_rate": 4.9443755075120844e-05, "loss": 0.8116, "num_input_tokens_seen": 6212192, "step": 10780 }, { "epoch": 1.6063449508489724, "grad_norm": 0.4841545820236206, "learning_rate": 4.944239097010648e-05, "loss": 0.7938, "num_input_tokens_seen": 6215424, "step": 10785 }, { "epoch": 1.6070896633899316, "grad_norm": 0.4633064866065979, "learning_rate": 4.9441025213374706e-05, "loss": 0.7939, "num_input_tokens_seen": 6218528, "step": 10790 }, { "epoch": 1.6078343759308906, "grad_norm": 0.415550172328949, "learning_rate": 4.9439657805017825e-05, "loss": 0.8307, "num_input_tokens_seen": 6221760, "step": 10795 }, { "epoch": 1.6085790884718498, "grad_norm": 0.3139588236808777, "learning_rate": 4.9438288745128234e-05, "loss": 0.7644, "num_input_tokens_seen": 6224320, "step": 10800 }, { "epoch": 1.609323801012809, "grad_norm": 0.42902714014053345, "learning_rate": 4.943691803379844e-05, "loss": 0.8149, "num_input_tokens_seen": 6227296, "step": 10805 }, { "epoch": 1.6100685135537682, "grad_norm": 0.47416752576828003, "learning_rate": 4.9435545671121085e-05, "loss": 0.7935, "num_input_tokens_seen": 6230048, "step": 10810 }, { "epoch": 1.6108132260947274, "grad_norm": 0.38886958360671997, "learning_rate": 4.94341716571889e-05, "loss": 0.8068, "num_input_tokens_seen": 6233120, "step": 10815 }, { "epoch": 1.6115579386356866, "grad_norm": 0.430869460105896, "learning_rate": 4.943279599209474e-05, "loss": 0.8159, "num_input_tokens_seen": 6235936, "step": 10820 }, { "epoch": 1.6123026511766458, "grad_norm": 0.4099065959453583, "learning_rate": 4.943141867593155e-05, "loss": 0.8263, "num_input_tokens_seen": 6238848, "step": 10825 }, { "epoch": 1.613047363717605, "grad_norm": 0.39866894483566284, "learning_rate": 4.943003970879243e-05, "loss": 0.7783, "num_input_tokens_seen": 6241504, "step": 10830 }, { "epoch": 1.6137920762585642, "grad_norm": 0.4526078701019287, "learning_rate": 4.942865909077055e-05, "loss": 0.8239, "num_input_tokens_seen": 6244160, "step": 10835 }, { "epoch": 1.6145367887995232, "grad_norm": 0.4831126928329468, "learning_rate": 4.942727682195921e-05, "loss": 0.7734, "num_input_tokens_seen": 6247072, "step": 10840 }, { "epoch": 1.6152815013404824, "grad_norm": 0.40639591217041016, "learning_rate": 4.942589290245181e-05, "loss": 0.8083, "num_input_tokens_seen": 6249792, "step": 10845 }, { "epoch": 1.6160262138814416, "grad_norm": 0.6397366523742676, "learning_rate": 4.9424507332341874e-05, "loss": 0.8162, "num_input_tokens_seen": 6252736, "step": 10850 }, { "epoch": 1.6167709264224008, "grad_norm": 0.49152326583862305, "learning_rate": 4.942312011172304e-05, "loss": 0.8042, "num_input_tokens_seen": 6255744, "step": 10855 }, { "epoch": 1.61751563896336, "grad_norm": 0.5058335661888123, "learning_rate": 4.942173124068905e-05, "loss": 0.7812, "num_input_tokens_seen": 6258688, "step": 10860 }, { "epoch": 1.6182603515043192, "grad_norm": 0.5646466612815857, "learning_rate": 4.9420340719333746e-05, "loss": 0.7993, "num_input_tokens_seen": 6261440, "step": 10865 }, { "epoch": 1.6190050640452784, "grad_norm": 0.38115909695625305, "learning_rate": 4.941894854775111e-05, "loss": 0.8255, "num_input_tokens_seen": 6264416, "step": 10870 }, { "epoch": 1.6197497765862376, "grad_norm": 0.46577104926109314, "learning_rate": 4.9417554726035206e-05, "loss": 0.8114, "num_input_tokens_seen": 6267360, "step": 10875 }, { "epoch": 1.6204944891271968, "grad_norm": 0.4057011604309082, "learning_rate": 4.941615925428024e-05, "loss": 0.7973, "num_input_tokens_seen": 6270304, "step": 10880 }, { "epoch": 1.621239201668156, "grad_norm": 0.4527948796749115, "learning_rate": 4.9414762132580486e-05, "loss": 0.793, "num_input_tokens_seen": 6273216, "step": 10885 }, { "epoch": 1.6219839142091153, "grad_norm": 0.5475317239761353, "learning_rate": 4.9413363361030374e-05, "loss": 0.8397, "num_input_tokens_seen": 6275936, "step": 10890 }, { "epoch": 1.6227286267500745, "grad_norm": 0.39813005924224854, "learning_rate": 4.941196293972442e-05, "loss": 0.7783, "num_input_tokens_seen": 6278720, "step": 10895 }, { "epoch": 1.6234733392910337, "grad_norm": 0.35620400309562683, "learning_rate": 4.941056086875727e-05, "loss": 0.8006, "num_input_tokens_seen": 6281568, "step": 10900 }, { "epoch": 1.6242180518319929, "grad_norm": 0.5379520058631897, "learning_rate": 4.940915714822366e-05, "loss": 0.8414, "num_input_tokens_seen": 6284608, "step": 10905 }, { "epoch": 1.624962764372952, "grad_norm": 0.6156772375106812, "learning_rate": 4.940775177821845e-05, "loss": 0.7617, "num_input_tokens_seen": 6287264, "step": 10910 }, { "epoch": 1.6257074769139113, "grad_norm": 0.39006611704826355, "learning_rate": 4.9406344758836606e-05, "loss": 0.8204, "num_input_tokens_seen": 6290208, "step": 10915 }, { "epoch": 1.6264521894548705, "grad_norm": 0.5243722200393677, "learning_rate": 4.9404936090173214e-05, "loss": 0.8385, "num_input_tokens_seen": 6293280, "step": 10920 }, { "epoch": 1.6271969019958297, "grad_norm": 0.46881595253944397, "learning_rate": 4.9403525772323466e-05, "loss": 0.8257, "num_input_tokens_seen": 6296000, "step": 10925 }, { "epoch": 1.6279416145367889, "grad_norm": 0.7060712575912476, "learning_rate": 4.9402113805382665e-05, "loss": 0.8039, "num_input_tokens_seen": 6298976, "step": 10930 }, { "epoch": 1.628686327077748, "grad_norm": 0.5238814949989319, "learning_rate": 4.9400700189446226e-05, "loss": 0.8179, "num_input_tokens_seen": 6302304, "step": 10935 }, { "epoch": 1.6294310396187073, "grad_norm": 0.49234601855278015, "learning_rate": 4.939928492460967e-05, "loss": 0.8137, "num_input_tokens_seen": 6305312, "step": 10940 }, { "epoch": 1.6301757521596665, "grad_norm": 0.5038599371910095, "learning_rate": 4.939786801096864e-05, "loss": 0.8009, "num_input_tokens_seen": 6308128, "step": 10945 }, { "epoch": 1.6309204647006257, "grad_norm": 0.47133389115333557, "learning_rate": 4.9396449448618886e-05, "loss": 0.8269, "num_input_tokens_seen": 6310656, "step": 10950 }, { "epoch": 1.6316651772415849, "grad_norm": 0.42226457595825195, "learning_rate": 4.9395029237656266e-05, "loss": 0.7861, "num_input_tokens_seen": 6313312, "step": 10955 }, { "epoch": 1.632409889782544, "grad_norm": 0.5727774500846863, "learning_rate": 4.939360737817675e-05, "loss": 0.7968, "num_input_tokens_seen": 6316288, "step": 10960 }, { "epoch": 1.6331546023235033, "grad_norm": 0.4862099289894104, "learning_rate": 4.939218387027643e-05, "loss": 0.8152, "num_input_tokens_seen": 6318976, "step": 10965 }, { "epoch": 1.6338993148644623, "grad_norm": 0.3499930500984192, "learning_rate": 4.939075871405149e-05, "loss": 0.8034, "num_input_tokens_seen": 6321888, "step": 10970 }, { "epoch": 1.6346440274054215, "grad_norm": 0.43122929334640503, "learning_rate": 4.9389331909598246e-05, "loss": 0.825, "num_input_tokens_seen": 6324800, "step": 10975 }, { "epoch": 1.6353887399463807, "grad_norm": 0.49700281023979187, "learning_rate": 4.938790345701311e-05, "loss": 0.7879, "num_input_tokens_seen": 6327424, "step": 10980 }, { "epoch": 1.6361334524873399, "grad_norm": 0.39605462551116943, "learning_rate": 4.9386473356392614e-05, "loss": 0.8234, "num_input_tokens_seen": 6330016, "step": 10985 }, { "epoch": 1.636878165028299, "grad_norm": 0.5743822455406189, "learning_rate": 4.93850416078334e-05, "loss": 0.823, "num_input_tokens_seen": 6332736, "step": 10990 }, { "epoch": 1.6376228775692583, "grad_norm": 0.4133007228374481, "learning_rate": 4.938360821143221e-05, "loss": 0.7568, "num_input_tokens_seen": 6335680, "step": 10995 }, { "epoch": 1.6383675901102175, "grad_norm": 0.3617090880870819, "learning_rate": 4.938217316728592e-05, "loss": 0.7833, "num_input_tokens_seen": 6338400, "step": 11000 }, { "epoch": 1.6391123026511767, "grad_norm": 0.48863595724105835, "learning_rate": 4.9380736475491484e-05, "loss": 0.7939, "num_input_tokens_seen": 6340928, "step": 11005 }, { "epoch": 1.6398570151921357, "grad_norm": 0.518606424331665, "learning_rate": 4.9379298136146016e-05, "loss": 0.7835, "num_input_tokens_seen": 6343520, "step": 11010 }, { "epoch": 1.6406017277330949, "grad_norm": 0.6743053793907166, "learning_rate": 4.9377858149346686e-05, "loss": 0.8282, "num_input_tokens_seen": 6346304, "step": 11015 }, { "epoch": 1.641346440274054, "grad_norm": 0.38457512855529785, "learning_rate": 4.937641651519083e-05, "loss": 0.8148, "num_input_tokens_seen": 6348992, "step": 11020 }, { "epoch": 1.6420911528150133, "grad_norm": 0.4957287013530731, "learning_rate": 4.937497323377584e-05, "loss": 0.8684, "num_input_tokens_seen": 6351776, "step": 11025 }, { "epoch": 1.6428358653559725, "grad_norm": 0.41461431980133057, "learning_rate": 4.9373528305199273e-05, "loss": 0.8547, "num_input_tokens_seen": 6354368, "step": 11030 }, { "epoch": 1.6435805778969317, "grad_norm": 0.5211139917373657, "learning_rate": 4.937208172955876e-05, "loss": 0.863, "num_input_tokens_seen": 6357440, "step": 11035 }, { "epoch": 1.6443252904378909, "grad_norm": 0.4219489097595215, "learning_rate": 4.937063350695204e-05, "loss": 0.8034, "num_input_tokens_seen": 6359968, "step": 11040 }, { "epoch": 1.64507000297885, "grad_norm": 0.3791712522506714, "learning_rate": 4.9369183637477e-05, "loss": 0.8306, "num_input_tokens_seen": 6362848, "step": 11045 }, { "epoch": 1.6458147155198093, "grad_norm": 0.42962369322776794, "learning_rate": 4.93677321212316e-05, "loss": 0.8015, "num_input_tokens_seen": 6365792, "step": 11050 }, { "epoch": 1.6465594280607685, "grad_norm": 0.4524526298046112, "learning_rate": 4.936627895831394e-05, "loss": 0.8537, "num_input_tokens_seen": 6368480, "step": 11055 }, { "epoch": 1.6473041406017277, "grad_norm": 0.6012473702430725, "learning_rate": 4.936482414882222e-05, "loss": 0.8131, "num_input_tokens_seen": 6371712, "step": 11060 }, { "epoch": 1.648048853142687, "grad_norm": 0.5123582482337952, "learning_rate": 4.9363367692854735e-05, "loss": 0.8049, "num_input_tokens_seen": 6374688, "step": 11065 }, { "epoch": 1.648793565683646, "grad_norm": 0.41748085618019104, "learning_rate": 4.9361909590509924e-05, "loss": 0.8066, "num_input_tokens_seen": 6377504, "step": 11070 }, { "epoch": 1.6495382782246053, "grad_norm": 0.4008704423904419, "learning_rate": 4.9360449841886304e-05, "loss": 0.7884, "num_input_tokens_seen": 6380160, "step": 11075 }, { "epoch": 1.6502829907655645, "grad_norm": 0.6648157835006714, "learning_rate": 4.935898844708253e-05, "loss": 0.8634, "num_input_tokens_seen": 6383200, "step": 11080 }, { "epoch": 1.6510277033065237, "grad_norm": 0.38839730620384216, "learning_rate": 4.9357525406197345e-05, "loss": 0.8047, "num_input_tokens_seen": 6385952, "step": 11085 }, { "epoch": 1.651772415847483, "grad_norm": 0.39027491211891174, "learning_rate": 4.9356060719329636e-05, "loss": 0.8165, "num_input_tokens_seen": 6388736, "step": 11090 }, { "epoch": 1.6525171283884421, "grad_norm": 0.4896872937679291, "learning_rate": 4.935459438657836e-05, "loss": 0.8074, "num_input_tokens_seen": 6391456, "step": 11095 }, { "epoch": 1.6532618409294013, "grad_norm": 0.3448595106601715, "learning_rate": 4.9353126408042616e-05, "loss": 0.7885, "num_input_tokens_seen": 6394336, "step": 11100 }, { "epoch": 1.6540065534703605, "grad_norm": 0.3886382579803467, "learning_rate": 4.9351656783821606e-05, "loss": 0.8082, "num_input_tokens_seen": 6397120, "step": 11105 }, { "epoch": 1.6547512660113197, "grad_norm": 0.5409761667251587, "learning_rate": 4.935018551401463e-05, "loss": 0.8335, "num_input_tokens_seen": 6400000, "step": 11110 }, { "epoch": 1.655495978552279, "grad_norm": 0.48380839824676514, "learning_rate": 4.934871259872112e-05, "loss": 0.8386, "num_input_tokens_seen": 6403168, "step": 11115 }, { "epoch": 1.6562406910932381, "grad_norm": 0.46150243282318115, "learning_rate": 4.9347238038040614e-05, "loss": 0.8472, "num_input_tokens_seen": 6405952, "step": 11120 }, { "epoch": 1.6569854036341973, "grad_norm": 0.4164988398551941, "learning_rate": 4.934576183207274e-05, "loss": 0.8214, "num_input_tokens_seen": 6408992, "step": 11125 }, { "epoch": 1.6577301161751565, "grad_norm": 0.39992621541023254, "learning_rate": 4.9344283980917273e-05, "loss": 0.7904, "num_input_tokens_seen": 6411680, "step": 11130 }, { "epoch": 1.6584748287161157, "grad_norm": 0.40958845615386963, "learning_rate": 4.9342804484674064e-05, "loss": 0.8167, "num_input_tokens_seen": 6414624, "step": 11135 }, { "epoch": 1.6592195412570747, "grad_norm": 0.4195314943790436, "learning_rate": 4.93413233434431e-05, "loss": 0.8182, "num_input_tokens_seen": 6417504, "step": 11140 }, { "epoch": 1.659964253798034, "grad_norm": 0.5391464829444885, "learning_rate": 4.933984055732447e-05, "loss": 0.8274, "num_input_tokens_seen": 6420192, "step": 11145 }, { "epoch": 1.6607089663389931, "grad_norm": 0.43090397119522095, "learning_rate": 4.9338356126418375e-05, "loss": 0.8066, "num_input_tokens_seen": 6423136, "step": 11150 }, { "epoch": 1.6614536788799523, "grad_norm": 0.45954039692878723, "learning_rate": 4.9336870050825124e-05, "loss": 0.7966, "num_input_tokens_seen": 6425920, "step": 11155 }, { "epoch": 1.6621983914209115, "grad_norm": 0.46544191241264343, "learning_rate": 4.933538233064514e-05, "loss": 0.811, "num_input_tokens_seen": 6428896, "step": 11160 }, { "epoch": 1.6629431039618707, "grad_norm": 0.5804521441459656, "learning_rate": 4.9333892965978955e-05, "loss": 0.8255, "num_input_tokens_seen": 6432032, "step": 11165 }, { "epoch": 1.66368781650283, "grad_norm": 0.3964213728904724, "learning_rate": 4.9332401956927224e-05, "loss": 0.7903, "num_input_tokens_seen": 6435008, "step": 11170 }, { "epoch": 1.6644325290437891, "grad_norm": 0.4982607960700989, "learning_rate": 4.93309093035907e-05, "loss": 0.8395, "num_input_tokens_seen": 6438176, "step": 11175 }, { "epoch": 1.6651772415847483, "grad_norm": 0.49231675267219543, "learning_rate": 4.932941500607025e-05, "loss": 0.8345, "num_input_tokens_seen": 6441088, "step": 11180 }, { "epoch": 1.6659219541257073, "grad_norm": 0.4384683072566986, "learning_rate": 4.9327919064466835e-05, "loss": 0.8199, "num_input_tokens_seen": 6444448, "step": 11185 }, { "epoch": 1.6666666666666665, "grad_norm": 0.4395730793476105, "learning_rate": 4.932642147888157e-05, "loss": 0.8077, "num_input_tokens_seen": 6447136, "step": 11190 }, { "epoch": 1.6674113792076257, "grad_norm": 0.44463682174682617, "learning_rate": 4.932492224941565e-05, "loss": 0.8047, "num_input_tokens_seen": 6449696, "step": 11195 }, { "epoch": 1.668156091748585, "grad_norm": 0.5003184080123901, "learning_rate": 4.932342137617037e-05, "loss": 0.8619, "num_input_tokens_seen": 6452736, "step": 11200 }, { "epoch": 1.6689008042895441, "grad_norm": 0.3913111090660095, "learning_rate": 4.932191885924717e-05, "loss": 0.7756, "num_input_tokens_seen": 6455712, "step": 11205 }, { "epoch": 1.6696455168305033, "grad_norm": 0.46113258600234985, "learning_rate": 4.9320414698747586e-05, "loss": 0.8227, "num_input_tokens_seen": 6458464, "step": 11210 }, { "epoch": 1.6703902293714625, "grad_norm": 0.6011428236961365, "learning_rate": 4.931890889477325e-05, "loss": 0.8695, "num_input_tokens_seen": 6461696, "step": 11215 }, { "epoch": 1.6711349419124217, "grad_norm": 0.3728460967540741, "learning_rate": 4.931740144742593e-05, "loss": 0.8014, "num_input_tokens_seen": 6464896, "step": 11220 }, { "epoch": 1.671879654453381, "grad_norm": 0.44375506043434143, "learning_rate": 4.931589235680748e-05, "loss": 0.78, "num_input_tokens_seen": 6467904, "step": 11225 }, { "epoch": 1.6726243669943401, "grad_norm": 0.513886034488678, "learning_rate": 4.931438162301989e-05, "loss": 0.8114, "num_input_tokens_seen": 6470688, "step": 11230 }, { "epoch": 1.6733690795352993, "grad_norm": 0.41198116540908813, "learning_rate": 4.931286924616524e-05, "loss": 0.7872, "num_input_tokens_seen": 6473536, "step": 11235 }, { "epoch": 1.6741137920762585, "grad_norm": 0.36260566115379333, "learning_rate": 4.931135522634574e-05, "loss": 0.8234, "num_input_tokens_seen": 6476352, "step": 11240 }, { "epoch": 1.6748585046172177, "grad_norm": 0.49706369638442993, "learning_rate": 4.930983956366369e-05, "loss": 0.8103, "num_input_tokens_seen": 6479008, "step": 11245 }, { "epoch": 1.675603217158177, "grad_norm": 0.48792698979377747, "learning_rate": 4.930832225822153e-05, "loss": 0.7946, "num_input_tokens_seen": 6482016, "step": 11250 }, { "epoch": 1.6763479296991362, "grad_norm": 0.3249003291130066, "learning_rate": 4.9306803310121776e-05, "loss": 0.768, "num_input_tokens_seen": 6484672, "step": 11255 }, { "epoch": 1.6770926422400954, "grad_norm": 0.4473438560962677, "learning_rate": 4.9305282719467076e-05, "loss": 0.8078, "num_input_tokens_seen": 6487584, "step": 11260 }, { "epoch": 1.6778373547810546, "grad_norm": 0.47021397948265076, "learning_rate": 4.93037604863602e-05, "loss": 0.8737, "num_input_tokens_seen": 6490400, "step": 11265 }, { "epoch": 1.6785820673220138, "grad_norm": 0.42966192960739136, "learning_rate": 4.930223661090398e-05, "loss": 0.8158, "num_input_tokens_seen": 6493440, "step": 11270 }, { "epoch": 1.679326779862973, "grad_norm": 0.41701552271842957, "learning_rate": 4.930071109320144e-05, "loss": 0.8068, "num_input_tokens_seen": 6496096, "step": 11275 }, { "epoch": 1.6800714924039322, "grad_norm": 0.4656935930252075, "learning_rate": 4.929918393335563e-05, "loss": 0.758, "num_input_tokens_seen": 6498880, "step": 11280 }, { "epoch": 1.6808162049448914, "grad_norm": 0.3871816098690033, "learning_rate": 4.9297655131469763e-05, "loss": 0.8225, "num_input_tokens_seen": 6501440, "step": 11285 }, { "epoch": 1.6815609174858506, "grad_norm": 0.4734969735145569, "learning_rate": 4.929612468764715e-05, "loss": 0.8112, "num_input_tokens_seen": 6504736, "step": 11290 }, { "epoch": 1.6823056300268098, "grad_norm": 0.5523364543914795, "learning_rate": 4.929459260199122e-05, "loss": 0.82, "num_input_tokens_seen": 6507424, "step": 11295 }, { "epoch": 1.683050342567769, "grad_norm": 0.47447073459625244, "learning_rate": 4.9293058874605485e-05, "loss": 0.8085, "num_input_tokens_seen": 6510560, "step": 11300 }, { "epoch": 1.6837950551087282, "grad_norm": 0.4463036060333252, "learning_rate": 4.9291523505593604e-05, "loss": 0.8078, "num_input_tokens_seen": 6513696, "step": 11305 }, { "epoch": 1.6845397676496874, "grad_norm": 0.3346408009529114, "learning_rate": 4.928998649505933e-05, "loss": 0.81, "num_input_tokens_seen": 6516800, "step": 11310 }, { "epoch": 1.6852844801906464, "grad_norm": 0.726409912109375, "learning_rate": 4.9288447843106525e-05, "loss": 0.8611, "num_input_tokens_seen": 6519840, "step": 11315 }, { "epoch": 1.6860291927316056, "grad_norm": 0.5186454057693481, "learning_rate": 4.9286907549839156e-05, "loss": 0.8025, "num_input_tokens_seen": 6522816, "step": 11320 }, { "epoch": 1.6867739052725648, "grad_norm": 0.4063074588775635, "learning_rate": 4.928536561536132e-05, "loss": 0.8098, "num_input_tokens_seen": 6525664, "step": 11325 }, { "epoch": 1.687518617813524, "grad_norm": 0.6508936882019043, "learning_rate": 4.928382203977722e-05, "loss": 0.8455, "num_input_tokens_seen": 6529088, "step": 11330 }, { "epoch": 1.6882633303544832, "grad_norm": 0.4064018726348877, "learning_rate": 4.9282276823191154e-05, "loss": 0.8037, "num_input_tokens_seen": 6531840, "step": 11335 }, { "epoch": 1.6890080428954424, "grad_norm": 0.424971342086792, "learning_rate": 4.9280729965707545e-05, "loss": 0.8083, "num_input_tokens_seen": 6534688, "step": 11340 }, { "epoch": 1.6897527554364016, "grad_norm": 0.4050409197807312, "learning_rate": 4.9279181467430926e-05, "loss": 0.828, "num_input_tokens_seen": 6537664, "step": 11345 }, { "epoch": 1.6904974679773608, "grad_norm": 0.4541911482810974, "learning_rate": 4.927763132846593e-05, "loss": 0.8308, "num_input_tokens_seen": 6540544, "step": 11350 }, { "epoch": 1.69124218051832, "grad_norm": 0.420908123254776, "learning_rate": 4.927607954891732e-05, "loss": 0.81, "num_input_tokens_seen": 6543648, "step": 11355 }, { "epoch": 1.691986893059279, "grad_norm": 0.3647322952747345, "learning_rate": 4.927452612888994e-05, "loss": 0.7918, "num_input_tokens_seen": 6546496, "step": 11360 }, { "epoch": 1.6927316056002382, "grad_norm": 0.43882015347480774, "learning_rate": 4.9272971068488795e-05, "loss": 0.7805, "num_input_tokens_seen": 6549568, "step": 11365 }, { "epoch": 1.6934763181411974, "grad_norm": 0.4696386456489563, "learning_rate": 4.9271414367818944e-05, "loss": 0.8154, "num_input_tokens_seen": 6552672, "step": 11370 }, { "epoch": 1.6942210306821566, "grad_norm": 0.4014117419719696, "learning_rate": 4.926985602698559e-05, "loss": 0.7865, "num_input_tokens_seen": 6555424, "step": 11375 }, { "epoch": 1.6949657432231158, "grad_norm": 0.5243474245071411, "learning_rate": 4.926829604609404e-05, "loss": 0.7717, "num_input_tokens_seen": 6558304, "step": 11380 }, { "epoch": 1.695710455764075, "grad_norm": 0.36359596252441406, "learning_rate": 4.926673442524971e-05, "loss": 0.7834, "num_input_tokens_seen": 6561120, "step": 11385 }, { "epoch": 1.6964551683050342, "grad_norm": 0.3708619177341461, "learning_rate": 4.926517116455813e-05, "loss": 0.7743, "num_input_tokens_seen": 6563968, "step": 11390 }, { "epoch": 1.6971998808459934, "grad_norm": 0.5157620906829834, "learning_rate": 4.926360626412494e-05, "loss": 0.7993, "num_input_tokens_seen": 6566656, "step": 11395 }, { "epoch": 1.6979445933869526, "grad_norm": 0.44184842705726624, "learning_rate": 4.926203972405588e-05, "loss": 0.7834, "num_input_tokens_seen": 6569696, "step": 11400 }, { "epoch": 1.6986893059279118, "grad_norm": 0.5486021637916565, "learning_rate": 4.926047154445683e-05, "loss": 0.8271, "num_input_tokens_seen": 6572608, "step": 11405 }, { "epoch": 1.699434018468871, "grad_norm": 0.5112899541854858, "learning_rate": 4.925890172543374e-05, "loss": 0.8274, "num_input_tokens_seen": 6576096, "step": 11410 }, { "epoch": 1.7001787310098302, "grad_norm": 0.4656943380832672, "learning_rate": 4.92573302670927e-05, "loss": 0.8395, "num_input_tokens_seen": 6578752, "step": 11415 }, { "epoch": 1.7009234435507894, "grad_norm": 0.4676955044269562, "learning_rate": 4.9255757169539905e-05, "loss": 0.7992, "num_input_tokens_seen": 6581376, "step": 11420 }, { "epoch": 1.7016681560917486, "grad_norm": 0.3627585172653198, "learning_rate": 4.9254182432881654e-05, "loss": 0.797, "num_input_tokens_seen": 6584352, "step": 11425 }, { "epoch": 1.7024128686327078, "grad_norm": 0.3929328918457031, "learning_rate": 4.9252606057224373e-05, "loss": 0.7893, "num_input_tokens_seen": 6587168, "step": 11430 }, { "epoch": 1.703157581173667, "grad_norm": 0.3628696799278259, "learning_rate": 4.9251028042674573e-05, "loss": 0.7994, "num_input_tokens_seen": 6590144, "step": 11435 }, { "epoch": 1.7039022937146262, "grad_norm": 0.38984575867652893, "learning_rate": 4.9249448389338905e-05, "loss": 0.8164, "num_input_tokens_seen": 6593024, "step": 11440 }, { "epoch": 1.7046470062555854, "grad_norm": 0.29212111234664917, "learning_rate": 4.9247867097324095e-05, "loss": 0.7764, "num_input_tokens_seen": 6596032, "step": 11445 }, { "epoch": 1.7053917187965446, "grad_norm": 0.3787009119987488, "learning_rate": 4.924628416673701e-05, "loss": 0.8301, "num_input_tokens_seen": 6598752, "step": 11450 }, { "epoch": 1.7061364313375038, "grad_norm": 0.7095714211463928, "learning_rate": 4.9244699597684625e-05, "loss": 0.9255, "num_input_tokens_seen": 6602528, "step": 11455 }, { "epoch": 1.706881143878463, "grad_norm": 0.5704317092895508, "learning_rate": 4.924311339027401e-05, "loss": 0.78, "num_input_tokens_seen": 6605472, "step": 11460 }, { "epoch": 1.7076258564194222, "grad_norm": 0.42391273379325867, "learning_rate": 4.924152554461236e-05, "loss": 0.8233, "num_input_tokens_seen": 6608512, "step": 11465 }, { "epoch": 1.7083705689603814, "grad_norm": 0.5260253548622131, "learning_rate": 4.9239936060806965e-05, "loss": 0.8296, "num_input_tokens_seen": 6611392, "step": 11470 }, { "epoch": 1.7091152815013406, "grad_norm": 0.5273864269256592, "learning_rate": 4.9238344938965254e-05, "loss": 0.7828, "num_input_tokens_seen": 6614624, "step": 11475 }, { "epoch": 1.7098599940422998, "grad_norm": 0.42038846015930176, "learning_rate": 4.923675217919473e-05, "loss": 0.7929, "num_input_tokens_seen": 6617536, "step": 11480 }, { "epoch": 1.710604706583259, "grad_norm": 0.4120687246322632, "learning_rate": 4.923515778160304e-05, "loss": 0.7985, "num_input_tokens_seen": 6620448, "step": 11485 }, { "epoch": 1.711349419124218, "grad_norm": 0.4802159070968628, "learning_rate": 4.9233561746297917e-05, "loss": 0.7984, "num_input_tokens_seen": 6623264, "step": 11490 }, { "epoch": 1.7120941316651772, "grad_norm": 0.40632733702659607, "learning_rate": 4.923196407338721e-05, "loss": 0.8294, "num_input_tokens_seen": 6626368, "step": 11495 }, { "epoch": 1.7128388442061364, "grad_norm": 0.6047766804695129, "learning_rate": 4.923036476297891e-05, "loss": 0.8258, "num_input_tokens_seen": 6629408, "step": 11500 }, { "epoch": 1.7135835567470956, "grad_norm": 0.8205471634864807, "learning_rate": 4.922876381518106e-05, "loss": 0.7931, "num_input_tokens_seen": 6632256, "step": 11505 }, { "epoch": 1.7143282692880548, "grad_norm": 0.34005579352378845, "learning_rate": 4.922716123010186e-05, "loss": 0.81, "num_input_tokens_seen": 6635232, "step": 11510 }, { "epoch": 1.715072981829014, "grad_norm": 0.41488537192344666, "learning_rate": 4.92255570078496e-05, "loss": 0.7899, "num_input_tokens_seen": 6638016, "step": 11515 }, { "epoch": 1.7158176943699732, "grad_norm": 0.4721550941467285, "learning_rate": 4.92239511485327e-05, "loss": 0.7899, "num_input_tokens_seen": 6641024, "step": 11520 }, { "epoch": 1.7165624069109324, "grad_norm": 0.3408076763153076, "learning_rate": 4.922234365225966e-05, "loss": 0.8591, "num_input_tokens_seen": 6643840, "step": 11525 }, { "epoch": 1.7173071194518914, "grad_norm": 0.4139745831489563, "learning_rate": 4.922073451913912e-05, "loss": 0.8338, "num_input_tokens_seen": 6646720, "step": 11530 }, { "epoch": 1.7180518319928506, "grad_norm": 0.4479590654373169, "learning_rate": 4.9219123749279816e-05, "loss": 0.8302, "num_input_tokens_seen": 6649760, "step": 11535 }, { "epoch": 1.7187965445338098, "grad_norm": 0.3647913634777069, "learning_rate": 4.92175113427906e-05, "loss": 0.7828, "num_input_tokens_seen": 6652448, "step": 11540 }, { "epoch": 1.719541257074769, "grad_norm": 0.39913254976272583, "learning_rate": 4.9215897299780426e-05, "loss": 0.8271, "num_input_tokens_seen": 6655328, "step": 11545 }, { "epoch": 1.7202859696157282, "grad_norm": 0.420478880405426, "learning_rate": 4.9214281620358374e-05, "loss": 0.8288, "num_input_tokens_seen": 6658368, "step": 11550 }, { "epoch": 1.7210306821566874, "grad_norm": 0.5455960035324097, "learning_rate": 4.92126643046336e-05, "loss": 0.824, "num_input_tokens_seen": 6661344, "step": 11555 }, { "epoch": 1.7217753946976466, "grad_norm": 0.5462584495544434, "learning_rate": 4.921104535271543e-05, "loss": 0.8475, "num_input_tokens_seen": 6664320, "step": 11560 }, { "epoch": 1.7225201072386058, "grad_norm": 0.41627171635627747, "learning_rate": 4.9209424764713246e-05, "loss": 0.811, "num_input_tokens_seen": 6667296, "step": 11565 }, { "epoch": 1.723264819779565, "grad_norm": 0.41291990876197815, "learning_rate": 4.920780254073656e-05, "loss": 0.8024, "num_input_tokens_seen": 6669952, "step": 11570 }, { "epoch": 1.7240095323205242, "grad_norm": 0.5039787888526917, "learning_rate": 4.920617868089501e-05, "loss": 0.7813, "num_input_tokens_seen": 6673056, "step": 11575 }, { "epoch": 1.7247542448614834, "grad_norm": 0.4110114276409149, "learning_rate": 4.9204553185298315e-05, "loss": 0.7898, "num_input_tokens_seen": 6675840, "step": 11580 }, { "epoch": 1.7254989574024426, "grad_norm": 0.45712390542030334, "learning_rate": 4.920292605405632e-05, "loss": 0.822, "num_input_tokens_seen": 6678464, "step": 11585 }, { "epoch": 1.7262436699434018, "grad_norm": 0.47552287578582764, "learning_rate": 4.9201297287278994e-05, "loss": 0.8246, "num_input_tokens_seen": 6681120, "step": 11590 }, { "epoch": 1.726988382484361, "grad_norm": 0.3646025061607361, "learning_rate": 4.919966688507638e-05, "loss": 0.7904, "num_input_tokens_seen": 6683872, "step": 11595 }, { "epoch": 1.7277330950253202, "grad_norm": 0.5294280648231506, "learning_rate": 4.919803484755867e-05, "loss": 0.7989, "num_input_tokens_seen": 6686912, "step": 11600 }, { "epoch": 1.7284778075662794, "grad_norm": 0.43214648962020874, "learning_rate": 4.919640117483616e-05, "loss": 0.8211, "num_input_tokens_seen": 6689792, "step": 11605 }, { "epoch": 1.7292225201072386, "grad_norm": 0.857628583908081, "learning_rate": 4.9194765867019214e-05, "loss": 0.889, "num_input_tokens_seen": 6692448, "step": 11610 }, { "epoch": 1.7299672326481979, "grad_norm": 0.47157323360443115, "learning_rate": 4.919312892421837e-05, "loss": 0.8045, "num_input_tokens_seen": 6695264, "step": 11615 }, { "epoch": 1.730711945189157, "grad_norm": 0.4868423044681549, "learning_rate": 4.9191490346544236e-05, "loss": 0.8323, "num_input_tokens_seen": 6698112, "step": 11620 }, { "epoch": 1.7314566577301163, "grad_norm": 0.517056405544281, "learning_rate": 4.918985013410754e-05, "loss": 0.8363, "num_input_tokens_seen": 6700864, "step": 11625 }, { "epoch": 1.7322013702710755, "grad_norm": 0.5984353423118591, "learning_rate": 4.918820828701912e-05, "loss": 0.8208, "num_input_tokens_seen": 6703808, "step": 11630 }, { "epoch": 1.7329460828120347, "grad_norm": 0.3649909198284149, "learning_rate": 4.9186564805389923e-05, "loss": 0.7902, "num_input_tokens_seen": 6707072, "step": 11635 }, { "epoch": 1.7336907953529939, "grad_norm": 0.502221941947937, "learning_rate": 4.918491968933101e-05, "loss": 0.8035, "num_input_tokens_seen": 6709728, "step": 11640 }, { "epoch": 1.734435507893953, "grad_norm": 0.3773782253265381, "learning_rate": 4.918327293895356e-05, "loss": 0.8007, "num_input_tokens_seen": 6712480, "step": 11645 }, { "epoch": 1.7351802204349123, "grad_norm": 0.3699693977832794, "learning_rate": 4.918162455436884e-05, "loss": 0.8329, "num_input_tokens_seen": 6715328, "step": 11650 }, { "epoch": 1.7359249329758715, "grad_norm": 0.44053611159324646, "learning_rate": 4.9179974535688256e-05, "loss": 0.796, "num_input_tokens_seen": 6718176, "step": 11655 }, { "epoch": 1.7366696455168305, "grad_norm": 0.4799290597438812, "learning_rate": 4.91783228830233e-05, "loss": 0.8293, "num_input_tokens_seen": 6721408, "step": 11660 }, { "epoch": 1.7374143580577897, "grad_norm": 0.4006984233856201, "learning_rate": 4.9176669596485584e-05, "loss": 0.8112, "num_input_tokens_seen": 6724096, "step": 11665 }, { "epoch": 1.7381590705987489, "grad_norm": 0.3836367130279541, "learning_rate": 4.917501467618682e-05, "loss": 0.7987, "num_input_tokens_seen": 6726944, "step": 11670 }, { "epoch": 1.738903783139708, "grad_norm": 0.4044206142425537, "learning_rate": 4.917335812223887e-05, "loss": 0.8026, "num_input_tokens_seen": 6729888, "step": 11675 }, { "epoch": 1.7396484956806673, "grad_norm": 0.47448426485061646, "learning_rate": 4.917169993475366e-05, "loss": 0.8246, "num_input_tokens_seen": 6732544, "step": 11680 }, { "epoch": 1.7403932082216265, "grad_norm": 0.5092507004737854, "learning_rate": 4.917004011384323e-05, "loss": 0.7961, "num_input_tokens_seen": 6735488, "step": 11685 }, { "epoch": 1.7411379207625857, "grad_norm": 0.3447153568267822, "learning_rate": 4.916837865961976e-05, "loss": 0.8501, "num_input_tokens_seen": 6738304, "step": 11690 }, { "epoch": 1.7418826333035449, "grad_norm": 0.3914327025413513, "learning_rate": 4.916671557219553e-05, "loss": 0.7848, "num_input_tokens_seen": 6741280, "step": 11695 }, { "epoch": 1.742627345844504, "grad_norm": 0.41843631863594055, "learning_rate": 4.916505085168291e-05, "loss": 0.7865, "num_input_tokens_seen": 6744000, "step": 11700 }, { "epoch": 1.743372058385463, "grad_norm": 0.3632400631904602, "learning_rate": 4.91633844981944e-05, "loss": 0.8012, "num_input_tokens_seen": 6746720, "step": 11705 }, { "epoch": 1.7441167709264223, "grad_norm": 0.38952383399009705, "learning_rate": 4.9161716511842614e-05, "loss": 0.8295, "num_input_tokens_seen": 6749856, "step": 11710 }, { "epoch": 1.7448614834673815, "grad_norm": 0.37470003962516785, "learning_rate": 4.916004689274026e-05, "loss": 0.8414, "num_input_tokens_seen": 6752640, "step": 11715 }, { "epoch": 1.7456061960083407, "grad_norm": 0.31112414598464966, "learning_rate": 4.915837564100016e-05, "loss": 0.8156, "num_input_tokens_seen": 6755520, "step": 11720 }, { "epoch": 1.7463509085492999, "grad_norm": 0.2821838855743408, "learning_rate": 4.915670275673525e-05, "loss": 0.8233, "num_input_tokens_seen": 6758368, "step": 11725 }, { "epoch": 1.747095621090259, "grad_norm": 0.5951010584831238, "learning_rate": 4.915502824005859e-05, "loss": 0.8082, "num_input_tokens_seen": 6761408, "step": 11730 }, { "epoch": 1.7478403336312183, "grad_norm": 0.3510800004005432, "learning_rate": 4.915335209108333e-05, "loss": 0.8188, "num_input_tokens_seen": 6764640, "step": 11735 }, { "epoch": 1.7485850461721775, "grad_norm": 0.3181931674480438, "learning_rate": 4.9151674309922736e-05, "loss": 0.779, "num_input_tokens_seen": 6767456, "step": 11740 }, { "epoch": 1.7493297587131367, "grad_norm": 0.43612393736839294, "learning_rate": 4.914999489669018e-05, "loss": 0.8392, "num_input_tokens_seen": 6770336, "step": 11745 }, { "epoch": 1.7500744712540959, "grad_norm": 0.46347540616989136, "learning_rate": 4.9148313851499156e-05, "loss": 0.8087, "num_input_tokens_seen": 6772992, "step": 11750 }, { "epoch": 1.750819183795055, "grad_norm": 0.3176230490207672, "learning_rate": 4.914663117446327e-05, "loss": 0.8322, "num_input_tokens_seen": 6775744, "step": 11755 }, { "epoch": 1.7515638963360143, "grad_norm": 0.37462300062179565, "learning_rate": 4.9144946865696204e-05, "loss": 0.8038, "num_input_tokens_seen": 6778688, "step": 11760 }, { "epoch": 1.7523086088769735, "grad_norm": 0.27620530128479004, "learning_rate": 4.9143260925311814e-05, "loss": 0.7979, "num_input_tokens_seen": 6781568, "step": 11765 }, { "epoch": 1.7530533214179327, "grad_norm": 0.5108064413070679, "learning_rate": 4.9141573353424e-05, "loss": 0.8338, "num_input_tokens_seen": 6784960, "step": 11770 }, { "epoch": 1.7537980339588919, "grad_norm": 0.4336259663105011, "learning_rate": 4.913988415014681e-05, "loss": 0.8114, "num_input_tokens_seen": 6788032, "step": 11775 }, { "epoch": 1.754542746499851, "grad_norm": 0.45515117049217224, "learning_rate": 4.9138193315594404e-05, "loss": 0.8332, "num_input_tokens_seen": 6790880, "step": 11780 }, { "epoch": 1.7552874590408103, "grad_norm": 0.359269917011261, "learning_rate": 4.913650084988103e-05, "loss": 0.7753, "num_input_tokens_seen": 6793696, "step": 11785 }, { "epoch": 1.7560321715817695, "grad_norm": 0.47187840938568115, "learning_rate": 4.9134806753121055e-05, "loss": 0.7968, "num_input_tokens_seen": 6796352, "step": 11790 }, { "epoch": 1.7567768841227287, "grad_norm": 0.4799261689186096, "learning_rate": 4.913311102542897e-05, "loss": 0.8198, "num_input_tokens_seen": 6799168, "step": 11795 }, { "epoch": 1.757521596663688, "grad_norm": 0.5470964312553406, "learning_rate": 4.913141366691936e-05, "loss": 0.8038, "num_input_tokens_seen": 6802432, "step": 11800 }, { "epoch": 1.758266309204647, "grad_norm": 0.39853963255882263, "learning_rate": 4.912971467770692e-05, "loss": 0.8034, "num_input_tokens_seen": 6805504, "step": 11805 }, { "epoch": 1.7590110217456063, "grad_norm": 0.41015222668647766, "learning_rate": 4.912801405790647e-05, "loss": 0.8098, "num_input_tokens_seen": 6808448, "step": 11810 }, { "epoch": 1.7597557342865655, "grad_norm": 0.4234744608402252, "learning_rate": 4.9126311807632926e-05, "loss": 0.7849, "num_input_tokens_seen": 6811424, "step": 11815 }, { "epoch": 1.7605004468275247, "grad_norm": 0.37154221534729004, "learning_rate": 4.912460792700132e-05, "loss": 0.824, "num_input_tokens_seen": 6813984, "step": 11820 }, { "epoch": 1.761245159368484, "grad_norm": 0.47694578766822815, "learning_rate": 4.912290241612679e-05, "loss": 0.8008, "num_input_tokens_seen": 6816928, "step": 11825 }, { "epoch": 1.7619898719094431, "grad_norm": 0.4755688011646271, "learning_rate": 4.91211952751246e-05, "loss": 0.8144, "num_input_tokens_seen": 6819776, "step": 11830 }, { "epoch": 1.762734584450402, "grad_norm": 0.4271766245365143, "learning_rate": 4.9119486504110105e-05, "loss": 0.7841, "num_input_tokens_seen": 6822496, "step": 11835 }, { "epoch": 1.7634792969913613, "grad_norm": 0.47397947311401367, "learning_rate": 4.911777610319877e-05, "loss": 0.8027, "num_input_tokens_seen": 6825472, "step": 11840 }, { "epoch": 1.7642240095323205, "grad_norm": 0.4140625596046448, "learning_rate": 4.911606407250617e-05, "loss": 0.8106, "num_input_tokens_seen": 6828352, "step": 11845 }, { "epoch": 1.7649687220732797, "grad_norm": 0.6019092202186584, "learning_rate": 4.9114350412148026e-05, "loss": 0.8277, "num_input_tokens_seen": 6831104, "step": 11850 }, { "epoch": 1.765713434614239, "grad_norm": 0.5186338424682617, "learning_rate": 4.911263512224011e-05, "loss": 0.8023, "num_input_tokens_seen": 6834336, "step": 11855 }, { "epoch": 1.766458147155198, "grad_norm": 0.3655867576599121, "learning_rate": 4.911091820289836e-05, "loss": 0.8383, "num_input_tokens_seen": 6836992, "step": 11860 }, { "epoch": 1.7672028596961573, "grad_norm": 0.6042576432228088, "learning_rate": 4.910919965423878e-05, "loss": 0.8005, "num_input_tokens_seen": 6839744, "step": 11865 }, { "epoch": 1.7679475722371165, "grad_norm": 0.47624510526657104, "learning_rate": 4.91074794763775e-05, "loss": 0.8156, "num_input_tokens_seen": 6842528, "step": 11870 }, { "epoch": 1.7686922847780755, "grad_norm": 0.5859829783439636, "learning_rate": 4.910575766943079e-05, "loss": 0.8154, "num_input_tokens_seen": 6845600, "step": 11875 }, { "epoch": 1.7694369973190347, "grad_norm": 0.4561615586280823, "learning_rate": 4.9104034233514965e-05, "loss": 0.8019, "num_input_tokens_seen": 6848384, "step": 11880 }, { "epoch": 1.770181709859994, "grad_norm": 0.49845555424690247, "learning_rate": 4.910230916874651e-05, "loss": 0.8159, "num_input_tokens_seen": 6851072, "step": 11885 }, { "epoch": 1.770926422400953, "grad_norm": 0.4469635486602783, "learning_rate": 4.9100582475242004e-05, "loss": 0.7945, "num_input_tokens_seen": 6853984, "step": 11890 }, { "epoch": 1.7716711349419123, "grad_norm": 0.35683897137641907, "learning_rate": 4.909885415311811e-05, "loss": 0.7998, "num_input_tokens_seen": 6856416, "step": 11895 }, { "epoch": 1.7724158474828715, "grad_norm": 0.4256107211112976, "learning_rate": 4.9097124202491636e-05, "loss": 0.814, "num_input_tokens_seen": 6859456, "step": 11900 }, { "epoch": 1.7731605600238307, "grad_norm": 0.4751564860343933, "learning_rate": 4.9095392623479474e-05, "loss": 0.857, "num_input_tokens_seen": 6862208, "step": 11905 }, { "epoch": 1.77390527256479, "grad_norm": 0.495922714471817, "learning_rate": 4.909365941619866e-05, "loss": 0.8075, "num_input_tokens_seen": 6865152, "step": 11910 }, { "epoch": 1.7746499851057491, "grad_norm": 0.31619369983673096, "learning_rate": 4.909192458076628e-05, "loss": 0.8308, "num_input_tokens_seen": 6867776, "step": 11915 }, { "epoch": 1.7753946976467083, "grad_norm": 0.38241496682167053, "learning_rate": 4.9090188117299596e-05, "loss": 0.8316, "num_input_tokens_seen": 6870848, "step": 11920 }, { "epoch": 1.7761394101876675, "grad_norm": 0.37733274698257446, "learning_rate": 4.908845002591594e-05, "loss": 0.8288, "num_input_tokens_seen": 6873696, "step": 11925 }, { "epoch": 1.7768841227286267, "grad_norm": 0.34086373448371887, "learning_rate": 4.9086710306732775e-05, "loss": 0.8122, "num_input_tokens_seen": 6876928, "step": 11930 }, { "epoch": 1.777628835269586, "grad_norm": 0.611312985420227, "learning_rate": 4.908496895986765e-05, "loss": 0.8393, "num_input_tokens_seen": 6879584, "step": 11935 }, { "epoch": 1.7783735478105451, "grad_norm": 0.5505900979042053, "learning_rate": 4.908322598543825e-05, "loss": 0.7863, "num_input_tokens_seen": 6882784, "step": 11940 }, { "epoch": 1.7791182603515043, "grad_norm": 0.43598660826683044, "learning_rate": 4.908148138356235e-05, "loss": 0.8055, "num_input_tokens_seen": 6885632, "step": 11945 }, { "epoch": 1.7798629728924635, "grad_norm": 0.33298131823539734, "learning_rate": 4.907973515435784e-05, "loss": 0.8169, "num_input_tokens_seen": 6888736, "step": 11950 }, { "epoch": 1.7806076854334227, "grad_norm": 0.5005552768707275, "learning_rate": 4.907798729794274e-05, "loss": 0.8123, "num_input_tokens_seen": 6891840, "step": 11955 }, { "epoch": 1.781352397974382, "grad_norm": 0.43136289715766907, "learning_rate": 4.907623781443515e-05, "loss": 0.8169, "num_input_tokens_seen": 6894592, "step": 11960 }, { "epoch": 1.7820971105153411, "grad_norm": 0.4714043140411377, "learning_rate": 4.9074486703953295e-05, "loss": 0.7853, "num_input_tokens_seen": 6897376, "step": 11965 }, { "epoch": 1.7828418230563003, "grad_norm": 0.42456454038619995, "learning_rate": 4.9072733966615506e-05, "loss": 0.7792, "num_input_tokens_seen": 6900096, "step": 11970 }, { "epoch": 1.7835865355972595, "grad_norm": 0.4804864227771759, "learning_rate": 4.907097960254023e-05, "loss": 0.7949, "num_input_tokens_seen": 6903008, "step": 11975 }, { "epoch": 1.7843312481382188, "grad_norm": 0.4616246819496155, "learning_rate": 4.9069223611846014e-05, "loss": 0.8065, "num_input_tokens_seen": 6906144, "step": 11980 }, { "epoch": 1.785075960679178, "grad_norm": 0.46874701976776123, "learning_rate": 4.906746599465153e-05, "loss": 0.7986, "num_input_tokens_seen": 6909152, "step": 11985 }, { "epoch": 1.7858206732201372, "grad_norm": 0.43560126423835754, "learning_rate": 4.906570675107555e-05, "loss": 0.8252, "num_input_tokens_seen": 6912320, "step": 11990 }, { "epoch": 1.7865653857610964, "grad_norm": 0.4038674235343933, "learning_rate": 4.906394588123694e-05, "loss": 0.8218, "num_input_tokens_seen": 6915232, "step": 11995 }, { "epoch": 1.7873100983020556, "grad_norm": 0.42717644572257996, "learning_rate": 4.9062183385254714e-05, "loss": 0.7641, "num_input_tokens_seen": 6918048, "step": 12000 }, { "epoch": 1.7880548108430145, "grad_norm": 0.5449954867362976, "learning_rate": 4.9060419263247954e-05, "loss": 0.8076, "num_input_tokens_seen": 6920832, "step": 12005 }, { "epoch": 1.7887995233839737, "grad_norm": 0.5160726308822632, "learning_rate": 4.905865351533589e-05, "loss": 0.8073, "num_input_tokens_seen": 6923712, "step": 12010 }, { "epoch": 1.789544235924933, "grad_norm": 0.3419908285140991, "learning_rate": 4.905688614163784e-05, "loss": 0.7516, "num_input_tokens_seen": 6926432, "step": 12015 }, { "epoch": 1.7902889484658921, "grad_norm": 0.6879850625991821, "learning_rate": 4.905511714227322e-05, "loss": 0.8428, "num_input_tokens_seen": 6929440, "step": 12020 }, { "epoch": 1.7910336610068514, "grad_norm": 0.41247549653053284, "learning_rate": 4.905334651736159e-05, "loss": 0.7834, "num_input_tokens_seen": 6932608, "step": 12025 }, { "epoch": 1.7917783735478106, "grad_norm": 0.5583620667457581, "learning_rate": 4.90515742670226e-05, "loss": 0.8478, "num_input_tokens_seen": 6935616, "step": 12030 }, { "epoch": 1.7925230860887698, "grad_norm": 0.38799017667770386, "learning_rate": 4.904980039137601e-05, "loss": 0.7718, "num_input_tokens_seen": 6938400, "step": 12035 }, { "epoch": 1.793267798629729, "grad_norm": 0.4137113094329834, "learning_rate": 4.904802489054168e-05, "loss": 0.8494, "num_input_tokens_seen": 6941344, "step": 12040 }, { "epoch": 1.7940125111706882, "grad_norm": 0.3260999619960785, "learning_rate": 4.9046247764639606e-05, "loss": 0.8187, "num_input_tokens_seen": 6944032, "step": 12045 }, { "epoch": 1.7947572237116471, "grad_norm": 0.6559731960296631, "learning_rate": 4.9044469013789876e-05, "loss": 0.8162, "num_input_tokens_seen": 6947040, "step": 12050 }, { "epoch": 1.7955019362526063, "grad_norm": 0.46881020069122314, "learning_rate": 4.904268863811268e-05, "loss": 0.8219, "num_input_tokens_seen": 6949824, "step": 12055 }, { "epoch": 1.7962466487935655, "grad_norm": 0.43590816855430603, "learning_rate": 4.9040906637728344e-05, "loss": 0.8197, "num_input_tokens_seen": 6952768, "step": 12060 }, { "epoch": 1.7969913613345248, "grad_norm": 0.37122729420661926, "learning_rate": 4.903912301275728e-05, "loss": 0.8022, "num_input_tokens_seen": 6955616, "step": 12065 }, { "epoch": 1.797736073875484, "grad_norm": 0.3510664403438568, "learning_rate": 4.903733776332001e-05, "loss": 0.8152, "num_input_tokens_seen": 6958528, "step": 12070 }, { "epoch": 1.7984807864164432, "grad_norm": 0.4910581409931183, "learning_rate": 4.903555088953719e-05, "loss": 0.8284, "num_input_tokens_seen": 6961472, "step": 12075 }, { "epoch": 1.7992254989574024, "grad_norm": 0.4185062348842621, "learning_rate": 4.9033762391529556e-05, "loss": 0.8142, "num_input_tokens_seen": 6964512, "step": 12080 }, { "epoch": 1.7999702114983616, "grad_norm": 0.6053033471107483, "learning_rate": 4.903197226941798e-05, "loss": 0.7979, "num_input_tokens_seen": 6967456, "step": 12085 }, { "epoch": 1.8007149240393208, "grad_norm": 0.3558102250099182, "learning_rate": 4.9030180523323425e-05, "loss": 0.7806, "num_input_tokens_seen": 6970272, "step": 12090 }, { "epoch": 1.80145963658028, "grad_norm": 0.46487316489219666, "learning_rate": 4.902838715336697e-05, "loss": 0.7816, "num_input_tokens_seen": 6973120, "step": 12095 }, { "epoch": 1.8022043491212392, "grad_norm": 0.5111076831817627, "learning_rate": 4.90265921596698e-05, "loss": 0.8391, "num_input_tokens_seen": 6975904, "step": 12100 }, { "epoch": 1.8029490616621984, "grad_norm": 0.40873104333877563, "learning_rate": 4.9024795542353216e-05, "loss": 0.8105, "num_input_tokens_seen": 6979072, "step": 12105 }, { "epoch": 1.8036937742031576, "grad_norm": 0.3525361120700836, "learning_rate": 4.902299730153863e-05, "loss": 0.8409, "num_input_tokens_seen": 6982112, "step": 12110 }, { "epoch": 1.8044384867441168, "grad_norm": 0.3466850519180298, "learning_rate": 4.9021197437347555e-05, "loss": 0.7845, "num_input_tokens_seen": 6984960, "step": 12115 }, { "epoch": 1.805183199285076, "grad_norm": 0.3757795989513397, "learning_rate": 4.901939594990162e-05, "loss": 0.7944, "num_input_tokens_seen": 6987808, "step": 12120 }, { "epoch": 1.8059279118260352, "grad_norm": 0.34955817461013794, "learning_rate": 4.901759283932257e-05, "loss": 0.8215, "num_input_tokens_seen": 6990496, "step": 12125 }, { "epoch": 1.8066726243669944, "grad_norm": 0.3884846568107605, "learning_rate": 4.9015788105732236e-05, "loss": 0.7944, "num_input_tokens_seen": 6993120, "step": 12130 }, { "epoch": 1.8074173369079536, "grad_norm": 0.35936596989631653, "learning_rate": 4.9013981749252585e-05, "loss": 0.8122, "num_input_tokens_seen": 6995840, "step": 12135 }, { "epoch": 1.8081620494489128, "grad_norm": 0.37874460220336914, "learning_rate": 4.901217377000568e-05, "loss": 0.7881, "num_input_tokens_seen": 6998496, "step": 12140 }, { "epoch": 1.808906761989872, "grad_norm": 0.5279278755187988, "learning_rate": 4.90103641681137e-05, "loss": 0.8258, "num_input_tokens_seen": 7001376, "step": 12145 }, { "epoch": 1.8096514745308312, "grad_norm": 0.3375421166419983, "learning_rate": 4.900855294369893e-05, "loss": 0.8112, "num_input_tokens_seen": 7004544, "step": 12150 }, { "epoch": 1.8103961870717904, "grad_norm": 0.3934973478317261, "learning_rate": 4.900674009688376e-05, "loss": 0.8169, "num_input_tokens_seen": 7007360, "step": 12155 }, { "epoch": 1.8111408996127496, "grad_norm": 0.49722957611083984, "learning_rate": 4.90049256277907e-05, "loss": 0.785, "num_input_tokens_seen": 7010176, "step": 12160 }, { "epoch": 1.8118856121537088, "grad_norm": 0.48354604840278625, "learning_rate": 4.900310953654236e-05, "loss": 0.8132, "num_input_tokens_seen": 7013280, "step": 12165 }, { "epoch": 1.812630324694668, "grad_norm": 0.41997963190078735, "learning_rate": 4.900129182326147e-05, "loss": 0.8049, "num_input_tokens_seen": 7016032, "step": 12170 }, { "epoch": 1.8133750372356272, "grad_norm": 0.41262325644493103, "learning_rate": 4.899947248807086e-05, "loss": 0.8569, "num_input_tokens_seen": 7020128, "step": 12175 }, { "epoch": 1.8141197497765862, "grad_norm": 0.404056191444397, "learning_rate": 4.899765153109348e-05, "loss": 0.7997, "num_input_tokens_seen": 7023232, "step": 12180 }, { "epoch": 1.8148644623175454, "grad_norm": 0.30801746249198914, "learning_rate": 4.899582895245237e-05, "loss": 0.7993, "num_input_tokens_seen": 7026048, "step": 12185 }, { "epoch": 1.8156091748585046, "grad_norm": 0.3399561047554016, "learning_rate": 4.89940047522707e-05, "loss": 0.8317, "num_input_tokens_seen": 7028768, "step": 12190 }, { "epoch": 1.8163538873994638, "grad_norm": 0.39355698227882385, "learning_rate": 4.899217893067174e-05, "loss": 0.8077, "num_input_tokens_seen": 7031712, "step": 12195 }, { "epoch": 1.817098599940423, "grad_norm": 0.49031952023506165, "learning_rate": 4.8990351487778875e-05, "loss": 0.8186, "num_input_tokens_seen": 7034720, "step": 12200 }, { "epoch": 1.8178433124813822, "grad_norm": 0.40928831696510315, "learning_rate": 4.89885224237156e-05, "loss": 0.8059, "num_input_tokens_seen": 7037536, "step": 12205 }, { "epoch": 1.8185880250223414, "grad_norm": 0.5986888408660889, "learning_rate": 4.89866917386055e-05, "loss": 0.7951, "num_input_tokens_seen": 7040480, "step": 12210 }, { "epoch": 1.8193327375633006, "grad_norm": 0.32933109998703003, "learning_rate": 4.89848594325723e-05, "loss": 0.7848, "num_input_tokens_seen": 7043552, "step": 12215 }, { "epoch": 1.8200774501042598, "grad_norm": 0.4579000473022461, "learning_rate": 4.898302550573981e-05, "loss": 0.8046, "num_input_tokens_seen": 7046368, "step": 12220 }, { "epoch": 1.8208221626452188, "grad_norm": 0.4505487084388733, "learning_rate": 4.898118995823197e-05, "loss": 0.7654, "num_input_tokens_seen": 7049472, "step": 12225 }, { "epoch": 1.821566875186178, "grad_norm": 0.5308231711387634, "learning_rate": 4.8979352790172814e-05, "loss": 0.811, "num_input_tokens_seen": 7052000, "step": 12230 }, { "epoch": 1.8223115877271372, "grad_norm": 0.5157550573348999, "learning_rate": 4.8977514001686485e-05, "loss": 0.8222, "num_input_tokens_seen": 7054880, "step": 12235 }, { "epoch": 1.8230563002680964, "grad_norm": 0.4351015090942383, "learning_rate": 4.8975673592897244e-05, "loss": 0.8343, "num_input_tokens_seen": 7057824, "step": 12240 }, { "epoch": 1.8238010128090556, "grad_norm": 0.3845003545284271, "learning_rate": 4.897383156392947e-05, "loss": 0.8287, "num_input_tokens_seen": 7060608, "step": 12245 }, { "epoch": 1.8245457253500148, "grad_norm": 0.4559733271598816, "learning_rate": 4.897198791490762e-05, "loss": 0.7663, "num_input_tokens_seen": 7063872, "step": 12250 }, { "epoch": 1.825290437890974, "grad_norm": 0.47663596272468567, "learning_rate": 4.897014264595629e-05, "loss": 0.8263, "num_input_tokens_seen": 7066560, "step": 12255 }, { "epoch": 1.8260351504319332, "grad_norm": 0.5039823651313782, "learning_rate": 4.896829575720018e-05, "loss": 0.8144, "num_input_tokens_seen": 7069408, "step": 12260 }, { "epoch": 1.8267798629728924, "grad_norm": 0.4127817451953888, "learning_rate": 4.8966447248764084e-05, "loss": 0.8195, "num_input_tokens_seen": 7072192, "step": 12265 }, { "epoch": 1.8275245755138516, "grad_norm": 0.5018987059593201, "learning_rate": 4.8964597120772926e-05, "loss": 0.7947, "num_input_tokens_seen": 7075200, "step": 12270 }, { "epoch": 1.8282692880548108, "grad_norm": 0.4556502103805542, "learning_rate": 4.8962745373351734e-05, "loss": 0.8022, "num_input_tokens_seen": 7077952, "step": 12275 }, { "epoch": 1.82901400059577, "grad_norm": 0.31034162640571594, "learning_rate": 4.8960892006625626e-05, "loss": 0.7962, "num_input_tokens_seen": 7080736, "step": 12280 }, { "epoch": 1.8297587131367292, "grad_norm": 0.3700774312019348, "learning_rate": 4.8959037020719854e-05, "loss": 0.7769, "num_input_tokens_seen": 7083680, "step": 12285 }, { "epoch": 1.8305034256776884, "grad_norm": 0.39990776777267456, "learning_rate": 4.895718041575978e-05, "loss": 0.8229, "num_input_tokens_seen": 7086464, "step": 12290 }, { "epoch": 1.8312481382186476, "grad_norm": 0.45517662167549133, "learning_rate": 4.895532219187085e-05, "loss": 0.7756, "num_input_tokens_seen": 7089568, "step": 12295 }, { "epoch": 1.8319928507596068, "grad_norm": 0.506677508354187, "learning_rate": 4.895346234917865e-05, "loss": 0.8088, "num_input_tokens_seen": 7092640, "step": 12300 }, { "epoch": 1.832737563300566, "grad_norm": 0.44880062341690063, "learning_rate": 4.8951600887808836e-05, "loss": 0.7912, "num_input_tokens_seen": 7095392, "step": 12305 }, { "epoch": 1.8334822758415252, "grad_norm": 0.3592306077480316, "learning_rate": 4.894973780788722e-05, "loss": 0.8234, "num_input_tokens_seen": 7098304, "step": 12310 }, { "epoch": 1.8342269883824844, "grad_norm": 0.395134836435318, "learning_rate": 4.89478731095397e-05, "loss": 0.8188, "num_input_tokens_seen": 7101056, "step": 12315 }, { "epoch": 1.8349717009234436, "grad_norm": 0.4645416736602783, "learning_rate": 4.894600679289228e-05, "loss": 0.8055, "num_input_tokens_seen": 7104064, "step": 12320 }, { "epoch": 1.8357164134644028, "grad_norm": 0.4557018280029297, "learning_rate": 4.8944138858071076e-05, "loss": 0.806, "num_input_tokens_seen": 7107104, "step": 12325 }, { "epoch": 1.836461126005362, "grad_norm": 0.3226293921470642, "learning_rate": 4.894226930520232e-05, "loss": 0.7838, "num_input_tokens_seen": 7110080, "step": 12330 }, { "epoch": 1.8372058385463212, "grad_norm": 0.3251435160636902, "learning_rate": 4.894039813441235e-05, "loss": 0.8299, "num_input_tokens_seen": 7112992, "step": 12335 }, { "epoch": 1.8379505510872804, "grad_norm": 0.42450159788131714, "learning_rate": 4.89385253458276e-05, "loss": 0.8473, "num_input_tokens_seen": 7115968, "step": 12340 }, { "epoch": 1.8386952636282397, "grad_norm": 0.5002928376197815, "learning_rate": 4.8936650939574636e-05, "loss": 0.8276, "num_input_tokens_seen": 7119040, "step": 12345 }, { "epoch": 1.8394399761691989, "grad_norm": 0.38285142183303833, "learning_rate": 4.893477491578013e-05, "loss": 0.7742, "num_input_tokens_seen": 7122016, "step": 12350 }, { "epoch": 1.8401846887101578, "grad_norm": 0.4310658872127533, "learning_rate": 4.893289727457083e-05, "loss": 0.8096, "num_input_tokens_seen": 7124832, "step": 12355 }, { "epoch": 1.840929401251117, "grad_norm": 0.43433380126953125, "learning_rate": 4.893101801607365e-05, "loss": 0.7873, "num_input_tokens_seen": 7127808, "step": 12360 }, { "epoch": 1.8416741137920762, "grad_norm": 0.4476326107978821, "learning_rate": 4.892913714041556e-05, "loss": 0.7997, "num_input_tokens_seen": 7130880, "step": 12365 }, { "epoch": 1.8424188263330354, "grad_norm": 0.5539685487747192, "learning_rate": 4.892725464772368e-05, "loss": 0.8085, "num_input_tokens_seen": 7133856, "step": 12370 }, { "epoch": 1.8431635388739946, "grad_norm": 0.3873386085033417, "learning_rate": 4.8925370538125204e-05, "loss": 0.8081, "num_input_tokens_seen": 7136768, "step": 12375 }, { "epoch": 1.8439082514149538, "grad_norm": 0.3820304274559021, "learning_rate": 4.892348481174747e-05, "loss": 0.7972, "num_input_tokens_seen": 7139680, "step": 12380 }, { "epoch": 1.844652963955913, "grad_norm": 0.4176015257835388, "learning_rate": 4.8921597468717887e-05, "loss": 0.8346, "num_input_tokens_seen": 7142560, "step": 12385 }, { "epoch": 1.8453976764968723, "grad_norm": 0.35791903734207153, "learning_rate": 4.891970850916401e-05, "loss": 0.7969, "num_input_tokens_seen": 7145248, "step": 12390 }, { "epoch": 1.8461423890378312, "grad_norm": 0.43607422709465027, "learning_rate": 4.891781793321348e-05, "loss": 0.7731, "num_input_tokens_seen": 7148064, "step": 12395 }, { "epoch": 1.8468871015787904, "grad_norm": 0.3774760365486145, "learning_rate": 4.8915925740994064e-05, "loss": 0.8158, "num_input_tokens_seen": 7151136, "step": 12400 }, { "epoch": 1.8476318141197496, "grad_norm": 0.4830952286720276, "learning_rate": 4.8914031932633613e-05, "loss": 0.8095, "num_input_tokens_seen": 7154016, "step": 12405 }, { "epoch": 1.8483765266607088, "grad_norm": 0.5251094698905945, "learning_rate": 4.891213650826012e-05, "loss": 0.824, "num_input_tokens_seen": 7156960, "step": 12410 }, { "epoch": 1.849121239201668, "grad_norm": 0.5012455582618713, "learning_rate": 4.891023946800165e-05, "loss": 0.7689, "num_input_tokens_seen": 7159872, "step": 12415 }, { "epoch": 1.8498659517426272, "grad_norm": 0.40076586604118347, "learning_rate": 4.890834081198642e-05, "loss": 0.8566, "num_input_tokens_seen": 7162880, "step": 12420 }, { "epoch": 1.8506106642835864, "grad_norm": 0.4129228889942169, "learning_rate": 4.890644054034271e-05, "loss": 0.8457, "num_input_tokens_seen": 7165824, "step": 12425 }, { "epoch": 1.8513553768245457, "grad_norm": 0.7128233909606934, "learning_rate": 4.890453865319896e-05, "loss": 0.8437, "num_input_tokens_seen": 7168544, "step": 12430 }, { "epoch": 1.8521000893655049, "grad_norm": 0.5071356892585754, "learning_rate": 4.890263515068367e-05, "loss": 0.7915, "num_input_tokens_seen": 7171200, "step": 12435 }, { "epoch": 1.852844801906464, "grad_norm": 0.28643354773521423, "learning_rate": 4.890073003292547e-05, "loss": 0.784, "num_input_tokens_seen": 7173952, "step": 12440 }, { "epoch": 1.8535895144474233, "grad_norm": 0.4221675395965576, "learning_rate": 4.8898823300053124e-05, "loss": 0.823, "num_input_tokens_seen": 7176704, "step": 12445 }, { "epoch": 1.8543342269883825, "grad_norm": 0.4689047932624817, "learning_rate": 4.889691495219545e-05, "loss": 0.7935, "num_input_tokens_seen": 7179552, "step": 12450 }, { "epoch": 1.8550789395293417, "grad_norm": 0.292390912771225, "learning_rate": 4.889500498948143e-05, "loss": 0.8123, "num_input_tokens_seen": 7182240, "step": 12455 }, { "epoch": 1.8558236520703009, "grad_norm": 0.5989458560943604, "learning_rate": 4.8893093412040114e-05, "loss": 0.8237, "num_input_tokens_seen": 7184960, "step": 12460 }, { "epoch": 1.85656836461126, "grad_norm": 0.4049832224845886, "learning_rate": 4.8891180220000696e-05, "loss": 0.8198, "num_input_tokens_seen": 7187808, "step": 12465 }, { "epoch": 1.8573130771522193, "grad_norm": 0.4795302450656891, "learning_rate": 4.8889265413492446e-05, "loss": 0.8031, "num_input_tokens_seen": 7190592, "step": 12470 }, { "epoch": 1.8580577896931785, "grad_norm": 0.3666209578514099, "learning_rate": 4.888734899264477e-05, "loss": 0.8314, "num_input_tokens_seen": 7193408, "step": 12475 }, { "epoch": 1.8588025022341377, "grad_norm": 0.4081287384033203, "learning_rate": 4.888543095758717e-05, "loss": 0.8122, "num_input_tokens_seen": 7196384, "step": 12480 }, { "epoch": 1.8595472147750969, "grad_norm": 0.498328298330307, "learning_rate": 4.888351130844926e-05, "loss": 0.7928, "num_input_tokens_seen": 7199168, "step": 12485 }, { "epoch": 1.860291927316056, "grad_norm": 0.5469213724136353, "learning_rate": 4.8881590045360744e-05, "loss": 0.8031, "num_input_tokens_seen": 7202208, "step": 12490 }, { "epoch": 1.8610366398570153, "grad_norm": 0.4315745532512665, "learning_rate": 4.8879667168451484e-05, "loss": 0.8087, "num_input_tokens_seen": 7204864, "step": 12495 }, { "epoch": 1.8617813523979745, "grad_norm": 0.34986478090286255, "learning_rate": 4.88777426778514e-05, "loss": 0.8499, "num_input_tokens_seen": 7207616, "step": 12500 }, { "epoch": 1.8625260649389337, "grad_norm": 0.4086403250694275, "learning_rate": 4.8875816573690544e-05, "loss": 0.7734, "num_input_tokens_seen": 7210496, "step": 12505 }, { "epoch": 1.863270777479893, "grad_norm": 0.4111282527446747, "learning_rate": 4.887388885609907e-05, "loss": 0.8886, "num_input_tokens_seen": 7213280, "step": 12510 }, { "epoch": 1.864015490020852, "grad_norm": 0.6142333149909973, "learning_rate": 4.887195952520726e-05, "loss": 0.8423, "num_input_tokens_seen": 7216256, "step": 12515 }, { "epoch": 1.8647602025618113, "grad_norm": 0.4197894334793091, "learning_rate": 4.887002858114548e-05, "loss": 0.8296, "num_input_tokens_seen": 7219040, "step": 12520 }, { "epoch": 1.8655049151027703, "grad_norm": 0.480630487203598, "learning_rate": 4.886809602404422e-05, "loss": 0.8229, "num_input_tokens_seen": 7221824, "step": 12525 }, { "epoch": 1.8662496276437295, "grad_norm": 0.5853443145751953, "learning_rate": 4.8866161854034064e-05, "loss": 0.8183, "num_input_tokens_seen": 7224832, "step": 12530 }, { "epoch": 1.8669943401846887, "grad_norm": 0.4354136288166046, "learning_rate": 4.886422607124572e-05, "loss": 0.7938, "num_input_tokens_seen": 7227840, "step": 12535 }, { "epoch": 1.8677390527256479, "grad_norm": 0.463733434677124, "learning_rate": 4.886228867581002e-05, "loss": 0.7871, "num_input_tokens_seen": 7231040, "step": 12540 }, { "epoch": 1.868483765266607, "grad_norm": 0.4557051658630371, "learning_rate": 4.886034966785785e-05, "loss": 0.8439, "num_input_tokens_seen": 7233888, "step": 12545 }, { "epoch": 1.8692284778075663, "grad_norm": 0.43587470054626465, "learning_rate": 4.8858409047520274e-05, "loss": 0.8398, "num_input_tokens_seen": 7236480, "step": 12550 }, { "epoch": 1.8699731903485255, "grad_norm": 0.4301174581050873, "learning_rate": 4.88564668149284e-05, "loss": 0.8043, "num_input_tokens_seen": 7239456, "step": 12555 }, { "epoch": 1.8707179028894847, "grad_norm": 0.5131571888923645, "learning_rate": 4.88545229702135e-05, "loss": 0.8294, "num_input_tokens_seen": 7242848, "step": 12560 }, { "epoch": 1.871462615430444, "grad_norm": 0.4802038371562958, "learning_rate": 4.8852577513506925e-05, "loss": 0.8318, "num_input_tokens_seen": 7245664, "step": 12565 }, { "epoch": 1.8722073279714029, "grad_norm": 0.3347416818141937, "learning_rate": 4.885063044494014e-05, "loss": 0.8228, "num_input_tokens_seen": 7248224, "step": 12570 }, { "epoch": 1.872952040512362, "grad_norm": 0.3363347053527832, "learning_rate": 4.884868176464471e-05, "loss": 0.8029, "num_input_tokens_seen": 7251008, "step": 12575 }, { "epoch": 1.8736967530533213, "grad_norm": 0.3744014799594879, "learning_rate": 4.8846731472752336e-05, "loss": 0.8021, "num_input_tokens_seen": 7253984, "step": 12580 }, { "epoch": 1.8744414655942805, "grad_norm": 0.3466411828994751, "learning_rate": 4.8844779569394805e-05, "loss": 0.8154, "num_input_tokens_seen": 7256736, "step": 12585 }, { "epoch": 1.8751861781352397, "grad_norm": 0.4776768386363983, "learning_rate": 4.884282605470401e-05, "loss": 0.8448, "num_input_tokens_seen": 7259424, "step": 12590 }, { "epoch": 1.875930890676199, "grad_norm": 0.3699803948402405, "learning_rate": 4.8840870928811966e-05, "loss": 0.8028, "num_input_tokens_seen": 7262240, "step": 12595 }, { "epoch": 1.876675603217158, "grad_norm": 0.523378312587738, "learning_rate": 4.8838914191850804e-05, "loss": 0.809, "num_input_tokens_seen": 7264736, "step": 12600 }, { "epoch": 1.8774203157581173, "grad_norm": 0.33963897824287415, "learning_rate": 4.883695584395274e-05, "loss": 0.7748, "num_input_tokens_seen": 7267488, "step": 12605 }, { "epoch": 1.8781650282990765, "grad_norm": 0.5716784000396729, "learning_rate": 4.883499588525011e-05, "loss": 0.8143, "num_input_tokens_seen": 7270240, "step": 12610 }, { "epoch": 1.8789097408400357, "grad_norm": 0.40856021642684937, "learning_rate": 4.883303431587536e-05, "loss": 0.7927, "num_input_tokens_seen": 7273216, "step": 12615 }, { "epoch": 1.879654453380995, "grad_norm": 0.38375696539878845, "learning_rate": 4.883107113596106e-05, "loss": 0.7992, "num_input_tokens_seen": 7275808, "step": 12620 }, { "epoch": 1.880399165921954, "grad_norm": 0.3774520456790924, "learning_rate": 4.882910634563985e-05, "loss": 0.8704, "num_input_tokens_seen": 7278688, "step": 12625 }, { "epoch": 1.8811438784629133, "grad_norm": 0.38164418935775757, "learning_rate": 4.882713994504453e-05, "loss": 0.8075, "num_input_tokens_seen": 7281984, "step": 12630 }, { "epoch": 1.8818885910038725, "grad_norm": 0.4880787432193756, "learning_rate": 4.882517193430796e-05, "loss": 0.8262, "num_input_tokens_seen": 7284992, "step": 12635 }, { "epoch": 1.8826333035448317, "grad_norm": 0.2941092848777771, "learning_rate": 4.882320231356313e-05, "loss": 0.7991, "num_input_tokens_seen": 7287872, "step": 12640 }, { "epoch": 1.883378016085791, "grad_norm": 0.343670129776001, "learning_rate": 4.882123108294316e-05, "loss": 0.7913, "num_input_tokens_seen": 7290688, "step": 12645 }, { "epoch": 1.8841227286267501, "grad_norm": 0.39752423763275146, "learning_rate": 4.881925824258123e-05, "loss": 0.8546, "num_input_tokens_seen": 7293504, "step": 12650 }, { "epoch": 1.8848674411677093, "grad_norm": 0.40007251501083374, "learning_rate": 4.881728379261068e-05, "loss": 0.7836, "num_input_tokens_seen": 7296384, "step": 12655 }, { "epoch": 1.8856121537086685, "grad_norm": 0.377918541431427, "learning_rate": 4.881530773316492e-05, "loss": 0.7881, "num_input_tokens_seen": 7299520, "step": 12660 }, { "epoch": 1.8863568662496277, "grad_norm": 0.5118857026100159, "learning_rate": 4.881333006437749e-05, "loss": 0.7876, "num_input_tokens_seen": 7302240, "step": 12665 }, { "epoch": 1.887101578790587, "grad_norm": 0.4243350923061371, "learning_rate": 4.881135078638203e-05, "loss": 0.7933, "num_input_tokens_seen": 7305312, "step": 12670 }, { "epoch": 1.8878462913315461, "grad_norm": 0.4396982789039612, "learning_rate": 4.88093698993123e-05, "loss": 0.8217, "num_input_tokens_seen": 7308192, "step": 12675 }, { "epoch": 1.8885910038725053, "grad_norm": 0.3637829124927521, "learning_rate": 4.880738740330215e-05, "loss": 0.7988, "num_input_tokens_seen": 7310944, "step": 12680 }, { "epoch": 1.8893357164134645, "grad_norm": 0.4321799576282501, "learning_rate": 4.8805403298485554e-05, "loss": 0.8091, "num_input_tokens_seen": 7313856, "step": 12685 }, { "epoch": 1.8900804289544237, "grad_norm": 0.33994749188423157, "learning_rate": 4.8803417584996584e-05, "loss": 0.7896, "num_input_tokens_seen": 7316608, "step": 12690 }, { "epoch": 1.890825141495383, "grad_norm": 0.48821011185646057, "learning_rate": 4.880143026296944e-05, "loss": 0.8081, "num_input_tokens_seen": 7319648, "step": 12695 }, { "epoch": 1.891569854036342, "grad_norm": 0.34043291211128235, "learning_rate": 4.87994413325384e-05, "loss": 0.8031, "num_input_tokens_seen": 7322368, "step": 12700 }, { "epoch": 1.8923145665773011, "grad_norm": 0.4867752492427826, "learning_rate": 4.879745079383789e-05, "loss": 0.8345, "num_input_tokens_seen": 7325376, "step": 12705 }, { "epoch": 1.8930592791182603, "grad_norm": 0.3261461853981018, "learning_rate": 4.879545864700239e-05, "loss": 0.8002, "num_input_tokens_seen": 7328096, "step": 12710 }, { "epoch": 1.8938039916592195, "grad_norm": 0.3728936016559601, "learning_rate": 4.879346489216655e-05, "loss": 0.8247, "num_input_tokens_seen": 7330816, "step": 12715 }, { "epoch": 1.8945487042001787, "grad_norm": 0.4142448604106903, "learning_rate": 4.8791469529465087e-05, "loss": 0.7724, "num_input_tokens_seen": 7333504, "step": 12720 }, { "epoch": 1.895293416741138, "grad_norm": 0.4290542006492615, "learning_rate": 4.878947255903284e-05, "loss": 0.8259, "num_input_tokens_seen": 7336480, "step": 12725 }, { "epoch": 1.8960381292820971, "grad_norm": 0.46813279390335083, "learning_rate": 4.878747398100477e-05, "loss": 0.7768, "num_input_tokens_seen": 7339648, "step": 12730 }, { "epoch": 1.8967828418230563, "grad_norm": 0.3464159667491913, "learning_rate": 4.878547379551591e-05, "loss": 0.7963, "num_input_tokens_seen": 7342592, "step": 12735 }, { "epoch": 1.8975275543640155, "grad_norm": 0.4160962700843811, "learning_rate": 4.8783472002701434e-05, "loss": 0.7842, "num_input_tokens_seen": 7345536, "step": 12740 }, { "epoch": 1.8982722669049745, "grad_norm": 0.310893714427948, "learning_rate": 4.8781468602696623e-05, "loss": 0.7783, "num_input_tokens_seen": 7348160, "step": 12745 }, { "epoch": 1.8990169794459337, "grad_norm": 0.3807593882083893, "learning_rate": 4.8779463595636857e-05, "loss": 0.7577, "num_input_tokens_seen": 7351136, "step": 12750 }, { "epoch": 1.899761691986893, "grad_norm": 0.39870625734329224, "learning_rate": 4.877745698165761e-05, "loss": 0.8113, "num_input_tokens_seen": 7354272, "step": 12755 }, { "epoch": 1.9005064045278521, "grad_norm": 0.4427226185798645, "learning_rate": 4.87754487608945e-05, "loss": 0.8427, "num_input_tokens_seen": 7357056, "step": 12760 }, { "epoch": 1.9012511170688113, "grad_norm": 0.38619476556777954, "learning_rate": 4.8773438933483224e-05, "loss": 0.8457, "num_input_tokens_seen": 7359904, "step": 12765 }, { "epoch": 1.9019958296097705, "grad_norm": 0.5306235551834106, "learning_rate": 4.8771427499559594e-05, "loss": 0.8083, "num_input_tokens_seen": 7362720, "step": 12770 }, { "epoch": 1.9027405421507297, "grad_norm": 0.39400237798690796, "learning_rate": 4.8769414459259556e-05, "loss": 0.8307, "num_input_tokens_seen": 7365696, "step": 12775 }, { "epoch": 1.903485254691689, "grad_norm": 0.44745415449142456, "learning_rate": 4.8767399812719115e-05, "loss": 0.764, "num_input_tokens_seen": 7368544, "step": 12780 }, { "epoch": 1.9042299672326481, "grad_norm": 0.4080175459384918, "learning_rate": 4.876538356007443e-05, "loss": 0.7796, "num_input_tokens_seen": 7371520, "step": 12785 }, { "epoch": 1.9049746797736073, "grad_norm": 0.3591352701187134, "learning_rate": 4.876336570146175e-05, "loss": 0.7886, "num_input_tokens_seen": 7374304, "step": 12790 }, { "epoch": 1.9057193923145666, "grad_norm": 0.4345097839832306, "learning_rate": 4.876134623701743e-05, "loss": 0.7893, "num_input_tokens_seen": 7377280, "step": 12795 }, { "epoch": 1.9064641048555258, "grad_norm": 0.340316504240036, "learning_rate": 4.875932516687793e-05, "loss": 0.781, "num_input_tokens_seen": 7380192, "step": 12800 }, { "epoch": 1.907208817396485, "grad_norm": 0.4103732705116272, "learning_rate": 4.8757302491179844e-05, "loss": 0.8565, "num_input_tokens_seen": 7382976, "step": 12805 }, { "epoch": 1.9079535299374442, "grad_norm": 0.3195495307445526, "learning_rate": 4.8755278210059845e-05, "loss": 0.8495, "num_input_tokens_seen": 7385696, "step": 12810 }, { "epoch": 1.9086982424784034, "grad_norm": 0.5108408331871033, "learning_rate": 4.8753252323654726e-05, "loss": 0.7834, "num_input_tokens_seen": 7388640, "step": 12815 }, { "epoch": 1.9094429550193626, "grad_norm": 0.36561092734336853, "learning_rate": 4.8751224832101383e-05, "loss": 0.8153, "num_input_tokens_seen": 7391488, "step": 12820 }, { "epoch": 1.9101876675603218, "grad_norm": 0.4874470829963684, "learning_rate": 4.874919573553683e-05, "loss": 0.7674, "num_input_tokens_seen": 7394080, "step": 12825 }, { "epoch": 1.910932380101281, "grad_norm": 0.3370838463306427, "learning_rate": 4.8747165034098196e-05, "loss": 0.7954, "num_input_tokens_seen": 7396672, "step": 12830 }, { "epoch": 1.9116770926422402, "grad_norm": 0.593833863735199, "learning_rate": 4.8745132727922696e-05, "loss": 0.7969, "num_input_tokens_seen": 7399488, "step": 12835 }, { "epoch": 1.9124218051831994, "grad_norm": 0.3847464919090271, "learning_rate": 4.874309881714766e-05, "loss": 0.795, "num_input_tokens_seen": 7402496, "step": 12840 }, { "epoch": 1.9131665177241586, "grad_norm": 0.5895423889160156, "learning_rate": 4.874106330191055e-05, "loss": 0.8614, "num_input_tokens_seen": 7405216, "step": 12845 }, { "epoch": 1.9139112302651178, "grad_norm": 0.30493584275245667, "learning_rate": 4.8739026182348894e-05, "loss": 0.836, "num_input_tokens_seen": 7408288, "step": 12850 }, { "epoch": 1.914655942806077, "grad_norm": 0.35207241773605347, "learning_rate": 4.873698745860037e-05, "loss": 0.8227, "num_input_tokens_seen": 7411456, "step": 12855 }, { "epoch": 1.9154006553470362, "grad_norm": 0.36475396156311035, "learning_rate": 4.873494713080274e-05, "loss": 0.7604, "num_input_tokens_seen": 7414336, "step": 12860 }, { "epoch": 1.9161453678879954, "grad_norm": 0.47758856415748596, "learning_rate": 4.8732905199093884e-05, "loss": 0.8283, "num_input_tokens_seen": 7417152, "step": 12865 }, { "epoch": 1.9168900804289544, "grad_norm": 0.40598785877227783, "learning_rate": 4.873086166361178e-05, "loss": 0.8236, "num_input_tokens_seen": 7420096, "step": 12870 }, { "epoch": 1.9176347929699136, "grad_norm": 0.4300172030925751, "learning_rate": 4.872881652449453e-05, "loss": 0.7968, "num_input_tokens_seen": 7422976, "step": 12875 }, { "epoch": 1.9183795055108728, "grad_norm": 0.5026722550392151, "learning_rate": 4.872676978188033e-05, "loss": 0.8327, "num_input_tokens_seen": 7426112, "step": 12880 }, { "epoch": 1.919124218051832, "grad_norm": 0.49363651871681213, "learning_rate": 4.8724721435907504e-05, "loss": 0.827, "num_input_tokens_seen": 7428960, "step": 12885 }, { "epoch": 1.9198689305927912, "grad_norm": 0.3513845205307007, "learning_rate": 4.8722671486714457e-05, "loss": 0.8277, "num_input_tokens_seen": 7431616, "step": 12890 }, { "epoch": 1.9206136431337504, "grad_norm": 0.3575337529182434, "learning_rate": 4.8720619934439715e-05, "loss": 0.8646, "num_input_tokens_seen": 7434304, "step": 12895 }, { "epoch": 1.9213583556747096, "grad_norm": 0.38860267400741577, "learning_rate": 4.871856677922193e-05, "loss": 0.8143, "num_input_tokens_seen": 7437120, "step": 12900 }, { "epoch": 1.9221030682156688, "grad_norm": 0.3222694993019104, "learning_rate": 4.8716512021199825e-05, "loss": 0.7792, "num_input_tokens_seen": 7440064, "step": 12905 }, { "epoch": 1.922847780756628, "grad_norm": 0.3547894060611725, "learning_rate": 4.871445566051226e-05, "loss": 0.7878, "num_input_tokens_seen": 7442880, "step": 12910 }, { "epoch": 1.923592493297587, "grad_norm": 0.4336429238319397, "learning_rate": 4.8712397697298207e-05, "loss": 0.8194, "num_input_tokens_seen": 7445792, "step": 12915 }, { "epoch": 1.9243372058385462, "grad_norm": 0.3722597062587738, "learning_rate": 4.871033813169672e-05, "loss": 0.7844, "num_input_tokens_seen": 7448960, "step": 12920 }, { "epoch": 1.9250819183795054, "grad_norm": 0.37659159302711487, "learning_rate": 4.870827696384698e-05, "loss": 0.7918, "num_input_tokens_seen": 7451840, "step": 12925 }, { "epoch": 1.9258266309204646, "grad_norm": 0.3951700031757355, "learning_rate": 4.870621419388828e-05, "loss": 0.8152, "num_input_tokens_seen": 7454560, "step": 12930 }, { "epoch": 1.9265713434614238, "grad_norm": 0.3814238905906677, "learning_rate": 4.870414982196e-05, "loss": 0.8223, "num_input_tokens_seen": 7457344, "step": 12935 }, { "epoch": 1.927316056002383, "grad_norm": 0.46190381050109863, "learning_rate": 4.870208384820165e-05, "loss": 0.8275, "num_input_tokens_seen": 7460128, "step": 12940 }, { "epoch": 1.9280607685433422, "grad_norm": 0.40259283781051636, "learning_rate": 4.8700016272752844e-05, "loss": 0.7988, "num_input_tokens_seen": 7463072, "step": 12945 }, { "epoch": 1.9288054810843014, "grad_norm": 0.3571394681930542, "learning_rate": 4.869794709575329e-05, "loss": 0.8204, "num_input_tokens_seen": 7466048, "step": 12950 }, { "epoch": 1.9295501936252606, "grad_norm": 0.45566922426223755, "learning_rate": 4.869587631734282e-05, "loss": 0.8386, "num_input_tokens_seen": 7469152, "step": 12955 }, { "epoch": 1.9302949061662198, "grad_norm": 0.39996132254600525, "learning_rate": 4.869380393766137e-05, "loss": 0.8244, "num_input_tokens_seen": 7471776, "step": 12960 }, { "epoch": 1.931039618707179, "grad_norm": 0.4355395436286926, "learning_rate": 4.8691729956848986e-05, "loss": 0.8091, "num_input_tokens_seen": 7474656, "step": 12965 }, { "epoch": 1.9317843312481382, "grad_norm": 0.39419251680374146, "learning_rate": 4.868965437504581e-05, "loss": 0.8215, "num_input_tokens_seen": 7477760, "step": 12970 }, { "epoch": 1.9325290437890974, "grad_norm": 0.443518728017807, "learning_rate": 4.868757719239211e-05, "loss": 0.7913, "num_input_tokens_seen": 7480416, "step": 12975 }, { "epoch": 1.9332737563300566, "grad_norm": 0.43925824761390686, "learning_rate": 4.8685498409028254e-05, "loss": 0.818, "num_input_tokens_seen": 7483168, "step": 12980 }, { "epoch": 1.9340184688710158, "grad_norm": 0.416051983833313, "learning_rate": 4.8683418025094704e-05, "loss": 0.783, "num_input_tokens_seen": 7486144, "step": 12985 }, { "epoch": 1.934763181411975, "grad_norm": 0.264786034822464, "learning_rate": 4.8681336040732055e-05, "loss": 0.8311, "num_input_tokens_seen": 7489024, "step": 12990 }, { "epoch": 1.9355078939529342, "grad_norm": 0.3671684265136719, "learning_rate": 4.8679252456081e-05, "loss": 0.8322, "num_input_tokens_seen": 7491648, "step": 12995 }, { "epoch": 1.9362526064938934, "grad_norm": 0.3730667233467102, "learning_rate": 4.8677167271282344e-05, "loss": 0.8008, "num_input_tokens_seen": 7494560, "step": 13000 }, { "epoch": 1.9369973190348526, "grad_norm": 0.40204715728759766, "learning_rate": 4.867508048647698e-05, "loss": 0.8187, "num_input_tokens_seen": 7497312, "step": 13005 }, { "epoch": 1.9377420315758118, "grad_norm": 0.47725552320480347, "learning_rate": 4.867299210180593e-05, "loss": 0.8102, "num_input_tokens_seen": 7500352, "step": 13010 }, { "epoch": 1.938486744116771, "grad_norm": 0.3249785006046295, "learning_rate": 4.867090211741033e-05, "loss": 0.8224, "num_input_tokens_seen": 7503552, "step": 13015 }, { "epoch": 1.9392314566577302, "grad_norm": 0.3879041075706482, "learning_rate": 4.86688105334314e-05, "loss": 0.7863, "num_input_tokens_seen": 7506528, "step": 13020 }, { "epoch": 1.9399761691986894, "grad_norm": 0.3463902771472931, "learning_rate": 4.866671735001048e-05, "loss": 0.8064, "num_input_tokens_seen": 7509120, "step": 13025 }, { "epoch": 1.9407208817396486, "grad_norm": 0.478020042181015, "learning_rate": 4.866462256728902e-05, "loss": 0.802, "num_input_tokens_seen": 7512224, "step": 13030 }, { "epoch": 1.9414655942806078, "grad_norm": 0.33929404616355896, "learning_rate": 4.8662526185408595e-05, "loss": 0.8079, "num_input_tokens_seen": 7514880, "step": 13035 }, { "epoch": 1.942210306821567, "grad_norm": 0.4363448917865753, "learning_rate": 4.866042820451084e-05, "loss": 0.8347, "num_input_tokens_seen": 7518080, "step": 13040 }, { "epoch": 1.942955019362526, "grad_norm": 0.5030380487442017, "learning_rate": 4.865832862473756e-05, "loss": 0.8233, "num_input_tokens_seen": 7520992, "step": 13045 }, { "epoch": 1.9436997319034852, "grad_norm": 0.4132351577281952, "learning_rate": 4.865622744623061e-05, "loss": 0.7785, "num_input_tokens_seen": 7523520, "step": 13050 }, { "epoch": 1.9444444444444444, "grad_norm": 0.4875341057777405, "learning_rate": 4.8654124669131984e-05, "loss": 0.8248, "num_input_tokens_seen": 7526112, "step": 13055 }, { "epoch": 1.9451891569854036, "grad_norm": 0.3216966688632965, "learning_rate": 4.865202029358379e-05, "loss": 0.8101, "num_input_tokens_seen": 7529120, "step": 13060 }, { "epoch": 1.9459338695263628, "grad_norm": 0.356393039226532, "learning_rate": 4.864991431972822e-05, "loss": 0.7834, "num_input_tokens_seen": 7531904, "step": 13065 }, { "epoch": 1.946678582067322, "grad_norm": 0.418760746717453, "learning_rate": 4.86478067477076e-05, "loss": 0.8275, "num_input_tokens_seen": 7534656, "step": 13070 }, { "epoch": 1.9474232946082812, "grad_norm": 0.328087717294693, "learning_rate": 4.8645697577664347e-05, "loss": 0.812, "num_input_tokens_seen": 7537088, "step": 13075 }, { "epoch": 1.9481680071492404, "grad_norm": 0.40464386343955994, "learning_rate": 4.8643586809740985e-05, "loss": 0.7848, "num_input_tokens_seen": 7539712, "step": 13080 }, { "epoch": 1.9489127196901996, "grad_norm": 0.5962561964988708, "learning_rate": 4.864147444408015e-05, "loss": 0.8352, "num_input_tokens_seen": 7542528, "step": 13085 }, { "epoch": 1.9496574322311586, "grad_norm": 0.34092679619789124, "learning_rate": 4.86393604808246e-05, "loss": 0.8027, "num_input_tokens_seen": 7545440, "step": 13090 }, { "epoch": 1.9504021447721178, "grad_norm": 0.35616859793663025, "learning_rate": 4.8637244920117175e-05, "loss": 0.8022, "num_input_tokens_seen": 7548192, "step": 13095 }, { "epoch": 1.951146857313077, "grad_norm": 0.3711029887199402, "learning_rate": 4.863512776210084e-05, "loss": 0.8268, "num_input_tokens_seen": 7550880, "step": 13100 }, { "epoch": 1.9518915698540362, "grad_norm": 0.34537866711616516, "learning_rate": 4.8633009006918665e-05, "loss": 0.8119, "num_input_tokens_seen": 7553888, "step": 13105 }, { "epoch": 1.9526362823949954, "grad_norm": 0.5078698396682739, "learning_rate": 4.863088865471382e-05, "loss": 0.8079, "num_input_tokens_seen": 7556896, "step": 13110 }, { "epoch": 1.9533809949359546, "grad_norm": 0.35797491669654846, "learning_rate": 4.8628766705629604e-05, "loss": 0.805, "num_input_tokens_seen": 7559648, "step": 13115 }, { "epoch": 1.9541257074769138, "grad_norm": 0.3840167224407196, "learning_rate": 4.862664315980939e-05, "loss": 0.8334, "num_input_tokens_seen": 7562560, "step": 13120 }, { "epoch": 1.954870420017873, "grad_norm": 0.4289124310016632, "learning_rate": 4.8624518017396706e-05, "loss": 0.8076, "num_input_tokens_seen": 7566080, "step": 13125 }, { "epoch": 1.9556151325588322, "grad_norm": 0.34202539920806885, "learning_rate": 4.862239127853514e-05, "loss": 0.8011, "num_input_tokens_seen": 7568800, "step": 13130 }, { "epoch": 1.9563598450997914, "grad_norm": 0.42662063241004944, "learning_rate": 4.8620262943368405e-05, "loss": 0.7707, "num_input_tokens_seen": 7571776, "step": 13135 }, { "epoch": 1.9571045576407506, "grad_norm": 0.3614419996738434, "learning_rate": 4.861813301204034e-05, "loss": 0.8036, "num_input_tokens_seen": 7574432, "step": 13140 }, { "epoch": 1.9578492701817098, "grad_norm": 0.4599630534648895, "learning_rate": 4.861600148469487e-05, "loss": 0.8019, "num_input_tokens_seen": 7577408, "step": 13145 }, { "epoch": 1.958593982722669, "grad_norm": 0.36805459856987, "learning_rate": 4.861386836147603e-05, "loss": 0.7883, "num_input_tokens_seen": 7580256, "step": 13150 }, { "epoch": 1.9593386952636282, "grad_norm": 0.2909265160560608, "learning_rate": 4.861173364252798e-05, "loss": 0.8307, "num_input_tokens_seen": 7583200, "step": 13155 }, { "epoch": 1.9600834078045875, "grad_norm": 0.35377228260040283, "learning_rate": 4.860959732799497e-05, "loss": 0.8331, "num_input_tokens_seen": 7585792, "step": 13160 }, { "epoch": 1.9608281203455467, "grad_norm": 0.3094537556171417, "learning_rate": 4.860745941802136e-05, "loss": 0.8229, "num_input_tokens_seen": 7588416, "step": 13165 }, { "epoch": 1.9615728328865059, "grad_norm": 0.435238778591156, "learning_rate": 4.860531991275162e-05, "loss": 0.7803, "num_input_tokens_seen": 7591392, "step": 13170 }, { "epoch": 1.962317545427465, "grad_norm": 0.4407745897769928, "learning_rate": 4.860317881233033e-05, "loss": 0.7954, "num_input_tokens_seen": 7594496, "step": 13175 }, { "epoch": 1.9630622579684243, "grad_norm": 0.33720624446868896, "learning_rate": 4.8601036116902184e-05, "loss": 0.797, "num_input_tokens_seen": 7597184, "step": 13180 }, { "epoch": 1.9638069705093835, "grad_norm": 0.4039870798587799, "learning_rate": 4.8598891826611974e-05, "loss": 0.8046, "num_input_tokens_seen": 7600096, "step": 13185 }, { "epoch": 1.9645516830503427, "grad_norm": 0.44372329115867615, "learning_rate": 4.85967459416046e-05, "loss": 0.7946, "num_input_tokens_seen": 7602976, "step": 13190 }, { "epoch": 1.9652963955913019, "grad_norm": 0.40711718797683716, "learning_rate": 4.859459846202507e-05, "loss": 0.7841, "num_input_tokens_seen": 7605600, "step": 13195 }, { "epoch": 1.966041108132261, "grad_norm": 0.35729512572288513, "learning_rate": 4.859244938801851e-05, "loss": 0.7806, "num_input_tokens_seen": 7608448, "step": 13200 }, { "epoch": 1.9667858206732203, "grad_norm": 0.3932299017906189, "learning_rate": 4.859029871973013e-05, "loss": 0.7888, "num_input_tokens_seen": 7611168, "step": 13205 }, { "epoch": 1.9675305332141795, "grad_norm": 0.43133682012557983, "learning_rate": 4.8588146457305284e-05, "loss": 0.7073, "num_input_tokens_seen": 7614144, "step": 13210 }, { "epoch": 1.9682752457551387, "grad_norm": 0.39218321442604065, "learning_rate": 4.85859926008894e-05, "loss": 0.7912, "num_input_tokens_seen": 7616896, "step": 13215 }, { "epoch": 1.9690199582960977, "grad_norm": 0.42186346650123596, "learning_rate": 4.858383715062803e-05, "loss": 0.8568, "num_input_tokens_seen": 7619904, "step": 13220 }, { "epoch": 1.9697646708370569, "grad_norm": 0.3962256610393524, "learning_rate": 4.8581680106666827e-05, "loss": 0.8268, "num_input_tokens_seen": 7622720, "step": 13225 }, { "epoch": 1.970509383378016, "grad_norm": 0.400816410779953, "learning_rate": 4.8579521469151555e-05, "loss": 0.7875, "num_input_tokens_seen": 7625792, "step": 13230 }, { "epoch": 1.9712540959189753, "grad_norm": 0.4863853454589844, "learning_rate": 4.85773612382281e-05, "loss": 0.8049, "num_input_tokens_seen": 7628896, "step": 13235 }, { "epoch": 1.9719988084599345, "grad_norm": 0.5381061434745789, "learning_rate": 4.857519941404242e-05, "loss": 0.8087, "num_input_tokens_seen": 7631776, "step": 13240 }, { "epoch": 1.9727435210008937, "grad_norm": 0.2789037525653839, "learning_rate": 4.8573035996740626e-05, "loss": 0.7914, "num_input_tokens_seen": 7634688, "step": 13245 }, { "epoch": 1.9734882335418529, "grad_norm": 0.3973037004470825, "learning_rate": 4.8570870986468886e-05, "loss": 0.7983, "num_input_tokens_seen": 7637696, "step": 13250 }, { "epoch": 1.974232946082812, "grad_norm": 0.534684419631958, "learning_rate": 4.856870438337353e-05, "loss": 0.7824, "num_input_tokens_seen": 7640384, "step": 13255 }, { "epoch": 1.974977658623771, "grad_norm": 0.4862162470817566, "learning_rate": 4.856653618760094e-05, "loss": 0.7619, "num_input_tokens_seen": 7643360, "step": 13260 }, { "epoch": 1.9757223711647303, "grad_norm": 0.6547532081604004, "learning_rate": 4.8564366399297666e-05, "loss": 0.8543, "num_input_tokens_seen": 7646208, "step": 13265 }, { "epoch": 1.9764670837056895, "grad_norm": 0.38083451986312866, "learning_rate": 4.856219501861031e-05, "loss": 0.7842, "num_input_tokens_seen": 7649024, "step": 13270 }, { "epoch": 1.9772117962466487, "grad_norm": 0.4850960075855255, "learning_rate": 4.8560022045685606e-05, "loss": 0.8401, "num_input_tokens_seen": 7652160, "step": 13275 }, { "epoch": 1.9779565087876079, "grad_norm": 0.4135059714317322, "learning_rate": 4.85578474806704e-05, "loss": 0.8084, "num_input_tokens_seen": 7655040, "step": 13280 }, { "epoch": 1.978701221328567, "grad_norm": 0.29192015528678894, "learning_rate": 4.8555671323711646e-05, "loss": 0.8257, "num_input_tokens_seen": 7657792, "step": 13285 }, { "epoch": 1.9794459338695263, "grad_norm": 0.41291531920433044, "learning_rate": 4.85534935749564e-05, "loss": 0.7843, "num_input_tokens_seen": 7660960, "step": 13290 }, { "epoch": 1.9801906464104855, "grad_norm": 0.4384375214576721, "learning_rate": 4.8551314234551814e-05, "loss": 0.8279, "num_input_tokens_seen": 7664000, "step": 13295 }, { "epoch": 1.9809353589514447, "grad_norm": 0.4233018159866333, "learning_rate": 4.854913330264516e-05, "loss": 0.7959, "num_input_tokens_seen": 7666912, "step": 13300 }, { "epoch": 1.9816800714924039, "grad_norm": 0.43492746353149414, "learning_rate": 4.8546950779383825e-05, "loss": 0.8793, "num_input_tokens_seen": 7669568, "step": 13305 }, { "epoch": 1.982424784033363, "grad_norm": 0.4528825283050537, "learning_rate": 4.854476666491529e-05, "loss": 0.7977, "num_input_tokens_seen": 7672608, "step": 13310 }, { "epoch": 1.9831694965743223, "grad_norm": 0.32233861088752747, "learning_rate": 4.854258095938715e-05, "loss": 0.8122, "num_input_tokens_seen": 7675232, "step": 13315 }, { "epoch": 1.9839142091152815, "grad_norm": 0.363020658493042, "learning_rate": 4.854039366294711e-05, "loss": 0.8018, "num_input_tokens_seen": 7678112, "step": 13320 }, { "epoch": 1.9846589216562407, "grad_norm": 0.5176764726638794, "learning_rate": 4.853820477574297e-05, "loss": 0.79, "num_input_tokens_seen": 7681152, "step": 13325 }, { "epoch": 1.9854036341972, "grad_norm": 0.355241984128952, "learning_rate": 4.853601429792265e-05, "loss": 0.8143, "num_input_tokens_seen": 7683808, "step": 13330 }, { "epoch": 1.986148346738159, "grad_norm": 0.40711677074432373, "learning_rate": 4.853382222963418e-05, "loss": 0.8138, "num_input_tokens_seen": 7686880, "step": 13335 }, { "epoch": 1.9868930592791183, "grad_norm": 0.4645216464996338, "learning_rate": 4.853162857102568e-05, "loss": 0.8196, "num_input_tokens_seen": 7689568, "step": 13340 }, { "epoch": 1.9876377718200775, "grad_norm": 0.41486185789108276, "learning_rate": 4.85294333222454e-05, "loss": 0.7978, "num_input_tokens_seen": 7692544, "step": 13345 }, { "epoch": 1.9883824843610367, "grad_norm": 0.41027289628982544, "learning_rate": 4.852723648344167e-05, "loss": 0.771, "num_input_tokens_seen": 7695392, "step": 13350 }, { "epoch": 1.989127196901996, "grad_norm": 0.38078784942626953, "learning_rate": 4.852503805476296e-05, "loss": 0.7765, "num_input_tokens_seen": 7698336, "step": 13355 }, { "epoch": 1.9898719094429551, "grad_norm": 0.42224615812301636, "learning_rate": 4.852283803635782e-05, "loss": 0.8109, "num_input_tokens_seen": 7701216, "step": 13360 }, { "epoch": 1.9906166219839143, "grad_norm": 0.3254557251930237, "learning_rate": 4.852063642837493e-05, "loss": 0.8266, "num_input_tokens_seen": 7704384, "step": 13365 }, { "epoch": 1.9913613345248735, "grad_norm": 0.5158149600028992, "learning_rate": 4.851843323096305e-05, "loss": 0.7945, "num_input_tokens_seen": 7707168, "step": 13370 }, { "epoch": 1.9921060470658327, "grad_norm": 0.3574809730052948, "learning_rate": 4.851622844427107e-05, "loss": 0.8367, "num_input_tokens_seen": 7709888, "step": 13375 }, { "epoch": 1.992850759606792, "grad_norm": 0.3798288404941559, "learning_rate": 4.851402206844799e-05, "loss": 0.824, "num_input_tokens_seen": 7712768, "step": 13380 }, { "epoch": 1.9935954721477511, "grad_norm": 0.24547836184501648, "learning_rate": 4.8511814103642894e-05, "loss": 0.7754, "num_input_tokens_seen": 7715680, "step": 13385 }, { "epoch": 1.99434018468871, "grad_norm": 0.3639111816883087, "learning_rate": 4.850960455000499e-05, "loss": 0.8151, "num_input_tokens_seen": 7718496, "step": 13390 }, { "epoch": 1.9950848972296693, "grad_norm": 0.43059197068214417, "learning_rate": 4.85073934076836e-05, "loss": 0.8508, "num_input_tokens_seen": 7721792, "step": 13395 }, { "epoch": 1.9958296097706285, "grad_norm": 0.4243305027484894, "learning_rate": 4.8505180676828144e-05, "loss": 0.7746, "num_input_tokens_seen": 7724800, "step": 13400 }, { "epoch": 1.9965743223115877, "grad_norm": 0.42604678869247437, "learning_rate": 4.850296635758813e-05, "loss": 0.8331, "num_input_tokens_seen": 7728096, "step": 13405 }, { "epoch": 1.997319034852547, "grad_norm": 0.5301812887191772, "learning_rate": 4.850075045011321e-05, "loss": 0.7808, "num_input_tokens_seen": 7731072, "step": 13410 }, { "epoch": 1.9980637473935061, "grad_norm": 0.4135911762714386, "learning_rate": 4.8498532954553125e-05, "loss": 0.8057, "num_input_tokens_seen": 7733952, "step": 13415 }, { "epoch": 1.9988084599344653, "grad_norm": 0.5180234313011169, "learning_rate": 4.8496313871057716e-05, "loss": 0.7869, "num_input_tokens_seen": 7737152, "step": 13420 }, { "epoch": 1.9995531724754245, "grad_norm": 0.4920101761817932, "learning_rate": 4.8494093199776944e-05, "loss": 0.8502, "num_input_tokens_seen": 7740128, "step": 13425 }, { "epoch": 2.0, "eval_loss": 0.8095512390136719, "eval_runtime": 45.4009, "eval_samples_per_second": 65.726, "eval_steps_per_second": 16.431, "num_input_tokens_seen": 7741288, "step": 13428 }, { "epoch": 2.0002978850163835, "grad_norm": 0.40612828731536865, "learning_rate": 4.849187094086088e-05, "loss": 0.8322, "num_input_tokens_seen": 7742568, "step": 13430 }, { "epoch": 2.0010425975573427, "grad_norm": 0.46767374873161316, "learning_rate": 4.848964709445969e-05, "loss": 0.7673, "num_input_tokens_seen": 7745416, "step": 13435 }, { "epoch": 2.001787310098302, "grad_norm": 0.4655180275440216, "learning_rate": 4.848742166072364e-05, "loss": 0.8035, "num_input_tokens_seen": 7748264, "step": 13440 }, { "epoch": 2.002532022639261, "grad_norm": 0.3434002995491028, "learning_rate": 4.8485194639803136e-05, "loss": 0.8221, "num_input_tokens_seen": 7751112, "step": 13445 }, { "epoch": 2.0032767351802203, "grad_norm": 0.343363881111145, "learning_rate": 4.848296603184866e-05, "loss": 0.7783, "num_input_tokens_seen": 7754024, "step": 13450 }, { "epoch": 2.0040214477211795, "grad_norm": 0.3559137284755707, "learning_rate": 4.848073583701081e-05, "loss": 0.7879, "num_input_tokens_seen": 7757064, "step": 13455 }, { "epoch": 2.0047661602621387, "grad_norm": 0.2752663791179657, "learning_rate": 4.847850405544031e-05, "loss": 0.7948, "num_input_tokens_seen": 7759816, "step": 13460 }, { "epoch": 2.005510872803098, "grad_norm": 0.38399219512939453, "learning_rate": 4.847627068728795e-05, "loss": 0.8575, "num_input_tokens_seen": 7763016, "step": 13465 }, { "epoch": 2.006255585344057, "grad_norm": 0.3219764232635498, "learning_rate": 4.847403573270467e-05, "loss": 0.7327, "num_input_tokens_seen": 7766024, "step": 13470 }, { "epoch": 2.0070002978850163, "grad_norm": 0.48546603322029114, "learning_rate": 4.847179919184149e-05, "loss": 0.8285, "num_input_tokens_seen": 7768712, "step": 13475 }, { "epoch": 2.0077450104259755, "grad_norm": 0.45091885328292847, "learning_rate": 4.8469561064849555e-05, "loss": 0.8339, "num_input_tokens_seen": 7771304, "step": 13480 }, { "epoch": 2.0084897229669347, "grad_norm": 0.3788488209247589, "learning_rate": 4.84673213518801e-05, "loss": 0.7935, "num_input_tokens_seen": 7774024, "step": 13485 }, { "epoch": 2.009234435507894, "grad_norm": 0.4429861307144165, "learning_rate": 4.846508005308448e-05, "loss": 0.8165, "num_input_tokens_seen": 7776936, "step": 13490 }, { "epoch": 2.009979148048853, "grad_norm": 0.4965892732143402, "learning_rate": 4.846283716861415e-05, "loss": 0.841, "num_input_tokens_seen": 7780040, "step": 13495 }, { "epoch": 2.0107238605898123, "grad_norm": 0.3507595658302307, "learning_rate": 4.8460592698620686e-05, "loss": 0.7905, "num_input_tokens_seen": 7782792, "step": 13500 }, { "epoch": 2.0114685731307715, "grad_norm": 0.42139947414398193, "learning_rate": 4.845834664325574e-05, "loss": 0.7995, "num_input_tokens_seen": 7785800, "step": 13505 }, { "epoch": 2.0122132856717307, "grad_norm": 0.41605862975120544, "learning_rate": 4.8456099002671104e-05, "loss": 0.8262, "num_input_tokens_seen": 7788392, "step": 13510 }, { "epoch": 2.01295799821269, "grad_norm": 0.35920146107673645, "learning_rate": 4.8453849777018675e-05, "loss": 0.8575, "num_input_tokens_seen": 7791432, "step": 13515 }, { "epoch": 2.013702710753649, "grad_norm": 0.5022938251495361, "learning_rate": 4.845159896645042e-05, "loss": 0.808, "num_input_tokens_seen": 7794376, "step": 13520 }, { "epoch": 2.0144474232946084, "grad_norm": 0.3411960005760193, "learning_rate": 4.844934657111846e-05, "loss": 0.831, "num_input_tokens_seen": 7797256, "step": 13525 }, { "epoch": 2.0151921358355676, "grad_norm": 0.42247968912124634, "learning_rate": 4.8447092591175e-05, "loss": 0.8097, "num_input_tokens_seen": 7799880, "step": 13530 }, { "epoch": 2.0159368483765268, "grad_norm": 0.3737667202949524, "learning_rate": 4.844483702677235e-05, "loss": 0.8048, "num_input_tokens_seen": 7802536, "step": 13535 }, { "epoch": 2.016681560917486, "grad_norm": 0.3806524872779846, "learning_rate": 4.8442579878062934e-05, "loss": 0.7861, "num_input_tokens_seen": 7805288, "step": 13540 }, { "epoch": 2.017426273458445, "grad_norm": 0.4096599519252777, "learning_rate": 4.844032114519928e-05, "loss": 0.7717, "num_input_tokens_seen": 7808200, "step": 13545 }, { "epoch": 2.0181709859994044, "grad_norm": 0.34019970893859863, "learning_rate": 4.8438060828334014e-05, "loss": 0.7923, "num_input_tokens_seen": 7811144, "step": 13550 }, { "epoch": 2.0189156985403636, "grad_norm": 0.4599688947200775, "learning_rate": 4.84357989276199e-05, "loss": 0.8431, "num_input_tokens_seen": 7814280, "step": 13555 }, { "epoch": 2.0196604110813228, "grad_norm": 0.3798283636569977, "learning_rate": 4.843353544320978e-05, "loss": 0.7868, "num_input_tokens_seen": 7817096, "step": 13560 }, { "epoch": 2.020405123622282, "grad_norm": 0.3202543258666992, "learning_rate": 4.84312703752566e-05, "loss": 0.8256, "num_input_tokens_seen": 7820008, "step": 13565 }, { "epoch": 2.021149836163241, "grad_norm": 0.3519999086856842, "learning_rate": 4.842900372391344e-05, "loss": 0.7957, "num_input_tokens_seen": 7822824, "step": 13570 }, { "epoch": 2.0218945487042004, "grad_norm": 0.3723328411579132, "learning_rate": 4.842673548933345e-05, "loss": 0.835, "num_input_tokens_seen": 7825480, "step": 13575 }, { "epoch": 2.0226392612451596, "grad_norm": 0.33393508195877075, "learning_rate": 4.8424465671669935e-05, "loss": 0.7961, "num_input_tokens_seen": 7828200, "step": 13580 }, { "epoch": 2.0233839737861183, "grad_norm": 0.4342215061187744, "learning_rate": 4.842219427107627e-05, "loss": 0.8564, "num_input_tokens_seen": 7831144, "step": 13585 }, { "epoch": 2.0241286863270775, "grad_norm": 0.2904374599456787, "learning_rate": 4.841992128770594e-05, "loss": 0.7697, "num_input_tokens_seen": 7833672, "step": 13590 }, { "epoch": 2.0248733988680367, "grad_norm": 0.42545148730278015, "learning_rate": 4.841764672171254e-05, "loss": 0.7872, "num_input_tokens_seen": 7836328, "step": 13595 }, { "epoch": 2.025618111408996, "grad_norm": 0.3776620328426361, "learning_rate": 4.841537057324979e-05, "loss": 0.8232, "num_input_tokens_seen": 7839080, "step": 13600 }, { "epoch": 2.026362823949955, "grad_norm": 0.4646444320678711, "learning_rate": 4.8413092842471496e-05, "loss": 0.8067, "num_input_tokens_seen": 7841768, "step": 13605 }, { "epoch": 2.0271075364909144, "grad_norm": 0.32593828439712524, "learning_rate": 4.841081352953158e-05, "loss": 0.7754, "num_input_tokens_seen": 7844872, "step": 13610 }, { "epoch": 2.0278522490318736, "grad_norm": 0.34776026010513306, "learning_rate": 4.8408532634584063e-05, "loss": 0.7741, "num_input_tokens_seen": 7847624, "step": 13615 }, { "epoch": 2.0285969615728328, "grad_norm": 0.4178732931613922, "learning_rate": 4.840625015778308e-05, "loss": 0.8229, "num_input_tokens_seen": 7850600, "step": 13620 }, { "epoch": 2.029341674113792, "grad_norm": 0.4467308819293976, "learning_rate": 4.8403966099282886e-05, "loss": 0.8148, "num_input_tokens_seen": 7853608, "step": 13625 }, { "epoch": 2.030086386654751, "grad_norm": 0.5862262845039368, "learning_rate": 4.840168045923781e-05, "loss": 0.8301, "num_input_tokens_seen": 7856872, "step": 13630 }, { "epoch": 2.0308310991957104, "grad_norm": 0.3625791370868683, "learning_rate": 4.8399393237802315e-05, "loss": 0.7952, "num_input_tokens_seen": 7860104, "step": 13635 }, { "epoch": 2.0315758117366696, "grad_norm": 0.48624593019485474, "learning_rate": 4.839710443513096e-05, "loss": 0.805, "num_input_tokens_seen": 7863144, "step": 13640 }, { "epoch": 2.0323205242776288, "grad_norm": 0.31676381826400757, "learning_rate": 4.8394814051378414e-05, "loss": 0.8207, "num_input_tokens_seen": 7866120, "step": 13645 }, { "epoch": 2.033065236818588, "grad_norm": 0.32596829533576965, "learning_rate": 4.839252208669944e-05, "loss": 0.8289, "num_input_tokens_seen": 7869160, "step": 13650 }, { "epoch": 2.033809949359547, "grad_norm": 0.36277174949645996, "learning_rate": 4.839022854124894e-05, "loss": 0.783, "num_input_tokens_seen": 7872136, "step": 13655 }, { "epoch": 2.0345546619005064, "grad_norm": 0.4155651330947876, "learning_rate": 4.838793341518189e-05, "loss": 0.7998, "num_input_tokens_seen": 7874888, "step": 13660 }, { "epoch": 2.0352993744414656, "grad_norm": 0.34354496002197266, "learning_rate": 4.838563670865339e-05, "loss": 0.7622, "num_input_tokens_seen": 7877896, "step": 13665 }, { "epoch": 2.036044086982425, "grad_norm": 0.46718311309814453, "learning_rate": 4.838333842181864e-05, "loss": 0.8132, "num_input_tokens_seen": 7880648, "step": 13670 }, { "epoch": 2.036788799523384, "grad_norm": 0.2754722535610199, "learning_rate": 4.838103855483295e-05, "loss": 0.7953, "num_input_tokens_seen": 7883432, "step": 13675 }, { "epoch": 2.037533512064343, "grad_norm": 0.5543937087059021, "learning_rate": 4.8378737107851736e-05, "loss": 0.8187, "num_input_tokens_seen": 7886280, "step": 13680 }, { "epoch": 2.0382782246053024, "grad_norm": 0.5011486411094666, "learning_rate": 4.837643408103051e-05, "loss": 0.8345, "num_input_tokens_seen": 7889128, "step": 13685 }, { "epoch": 2.0390229371462616, "grad_norm": 0.3291375935077667, "learning_rate": 4.837412947452492e-05, "loss": 0.7983, "num_input_tokens_seen": 7892200, "step": 13690 }, { "epoch": 2.039767649687221, "grad_norm": 0.40999504923820496, "learning_rate": 4.8371823288490694e-05, "loss": 0.8123, "num_input_tokens_seen": 7895112, "step": 13695 }, { "epoch": 2.04051236222818, "grad_norm": 0.4252467155456543, "learning_rate": 4.8369515523083664e-05, "loss": 0.7986, "num_input_tokens_seen": 7898056, "step": 13700 }, { "epoch": 2.041257074769139, "grad_norm": 0.40572643280029297, "learning_rate": 4.83672061784598e-05, "loss": 0.8225, "num_input_tokens_seen": 7900840, "step": 13705 }, { "epoch": 2.0420017873100984, "grad_norm": 0.3351109027862549, "learning_rate": 4.836489525477513e-05, "loss": 0.8239, "num_input_tokens_seen": 7903720, "step": 13710 }, { "epoch": 2.0427464998510576, "grad_norm": 0.3820059299468994, "learning_rate": 4.8362582752185844e-05, "loss": 0.8185, "num_input_tokens_seen": 7906984, "step": 13715 }, { "epoch": 2.043491212392017, "grad_norm": 0.3869287371635437, "learning_rate": 4.836026867084821e-05, "loss": 0.8205, "num_input_tokens_seen": 7909928, "step": 13720 }, { "epoch": 2.044235924932976, "grad_norm": 0.4235575199127197, "learning_rate": 4.8357953010918585e-05, "loss": 0.8325, "num_input_tokens_seen": 7913064, "step": 13725 }, { "epoch": 2.044980637473935, "grad_norm": 0.46067896485328674, "learning_rate": 4.835563577255346e-05, "loss": 0.7939, "num_input_tokens_seen": 7915784, "step": 13730 }, { "epoch": 2.0457253500148944, "grad_norm": 0.3439805805683136, "learning_rate": 4.835331695590943e-05, "loss": 0.8111, "num_input_tokens_seen": 7918664, "step": 13735 }, { "epoch": 2.0464700625558536, "grad_norm": 0.37058523297309875, "learning_rate": 4.835099656114319e-05, "loss": 0.7792, "num_input_tokens_seen": 7921704, "step": 13740 }, { "epoch": 2.047214775096813, "grad_norm": 0.3724251985549927, "learning_rate": 4.834867458841154e-05, "loss": 0.7891, "num_input_tokens_seen": 7924136, "step": 13745 }, { "epoch": 2.047959487637772, "grad_norm": 0.33321458101272583, "learning_rate": 4.8346351037871386e-05, "loss": 0.8206, "num_input_tokens_seen": 7927080, "step": 13750 }, { "epoch": 2.0487042001787312, "grad_norm": 0.3731391429901123, "learning_rate": 4.8344025909679746e-05, "loss": 0.7987, "num_input_tokens_seen": 7930120, "step": 13755 }, { "epoch": 2.04944891271969, "grad_norm": 0.4212731122970581, "learning_rate": 4.834169920399375e-05, "loss": 0.8591, "num_input_tokens_seen": 7932936, "step": 13760 }, { "epoch": 2.050193625260649, "grad_norm": 0.4542049765586853, "learning_rate": 4.8339370920970614e-05, "loss": 0.8284, "num_input_tokens_seen": 7935720, "step": 13765 }, { "epoch": 2.0509383378016084, "grad_norm": 0.46957531571388245, "learning_rate": 4.8337041060767696e-05, "loss": 0.7801, "num_input_tokens_seen": 7938760, "step": 13770 }, { "epoch": 2.0516830503425676, "grad_norm": 0.349727064371109, "learning_rate": 4.833470962354242e-05, "loss": 0.8104, "num_input_tokens_seen": 7941416, "step": 13775 }, { "epoch": 2.052427762883527, "grad_norm": 0.46155449748039246, "learning_rate": 4.8332376609452334e-05, "loss": 0.8519, "num_input_tokens_seen": 7945608, "step": 13780 }, { "epoch": 2.053172475424486, "grad_norm": 0.40019845962524414, "learning_rate": 4.83300420186551e-05, "loss": 0.8246, "num_input_tokens_seen": 7948360, "step": 13785 }, { "epoch": 2.053917187965445, "grad_norm": 0.4597957730293274, "learning_rate": 4.832770585130849e-05, "loss": 0.8015, "num_input_tokens_seen": 7951240, "step": 13790 }, { "epoch": 2.0546619005064044, "grad_norm": 0.3209855854511261, "learning_rate": 4.8325368107570354e-05, "loss": 0.8291, "num_input_tokens_seen": 7954088, "step": 13795 }, { "epoch": 2.0554066130473636, "grad_norm": 0.38516560196876526, "learning_rate": 4.8323028787598666e-05, "loss": 0.8268, "num_input_tokens_seen": 7956968, "step": 13800 }, { "epoch": 2.056151325588323, "grad_norm": 0.39182931184768677, "learning_rate": 4.832068789155153e-05, "loss": 0.7967, "num_input_tokens_seen": 7959816, "step": 13805 }, { "epoch": 2.056896038129282, "grad_norm": 0.42615005373954773, "learning_rate": 4.831834541958712e-05, "loss": 0.812, "num_input_tokens_seen": 7962792, "step": 13810 }, { "epoch": 2.057640750670241, "grad_norm": 0.3717752993106842, "learning_rate": 4.8316001371863726e-05, "loss": 0.8087, "num_input_tokens_seen": 7965608, "step": 13815 }, { "epoch": 2.0583854632112004, "grad_norm": 0.39928117394447327, "learning_rate": 4.831365574853977e-05, "loss": 0.8337, "num_input_tokens_seen": 7968424, "step": 13820 }, { "epoch": 2.0591301757521596, "grad_norm": 0.3418669104576111, "learning_rate": 4.831130854977373e-05, "loss": 0.7919, "num_input_tokens_seen": 7971496, "step": 13825 }, { "epoch": 2.059874888293119, "grad_norm": 0.34288290143013, "learning_rate": 4.830895977572424e-05, "loss": 0.7716, "num_input_tokens_seen": 7974152, "step": 13830 }, { "epoch": 2.060619600834078, "grad_norm": 0.5013452172279358, "learning_rate": 4.830660942655001e-05, "loss": 0.8432, "num_input_tokens_seen": 7977352, "step": 13835 }, { "epoch": 2.0613643133750372, "grad_norm": 0.341786652803421, "learning_rate": 4.8304257502409875e-05, "loss": 0.8364, "num_input_tokens_seen": 7979976, "step": 13840 }, { "epoch": 2.0621090259159964, "grad_norm": 0.4761490821838379, "learning_rate": 4.830190400346277e-05, "loss": 0.8293, "num_input_tokens_seen": 7982920, "step": 13845 }, { "epoch": 2.0628537384569556, "grad_norm": 0.4140370488166809, "learning_rate": 4.829954892986773e-05, "loss": 0.7951, "num_input_tokens_seen": 7985544, "step": 13850 }, { "epoch": 2.063598450997915, "grad_norm": 0.3440035581588745, "learning_rate": 4.829719228178391e-05, "loss": 0.793, "num_input_tokens_seen": 7988488, "step": 13855 }, { "epoch": 2.064343163538874, "grad_norm": 0.44544726610183716, "learning_rate": 4.829483405937054e-05, "loss": 0.816, "num_input_tokens_seen": 7991240, "step": 13860 }, { "epoch": 2.0650878760798332, "grad_norm": 0.3937302231788635, "learning_rate": 4.8292474262787e-05, "loss": 0.8045, "num_input_tokens_seen": 7994280, "step": 13865 }, { "epoch": 2.0658325886207924, "grad_norm": 0.48457518219947815, "learning_rate": 4.829011289219276e-05, "loss": 0.8259, "num_input_tokens_seen": 7997192, "step": 13870 }, { "epoch": 2.0665773011617516, "grad_norm": 0.521298885345459, "learning_rate": 4.828774994774737e-05, "loss": 0.8174, "num_input_tokens_seen": 8000072, "step": 13875 }, { "epoch": 2.067322013702711, "grad_norm": 0.3963097035884857, "learning_rate": 4.828538542961052e-05, "loss": 0.7757, "num_input_tokens_seen": 8003048, "step": 13880 }, { "epoch": 2.06806672624367, "grad_norm": 0.37534299492836, "learning_rate": 4.8283019337942e-05, "loss": 0.7997, "num_input_tokens_seen": 8005768, "step": 13885 }, { "epoch": 2.0688114387846293, "grad_norm": 0.4458513557910919, "learning_rate": 4.828065167290169e-05, "loss": 0.8307, "num_input_tokens_seen": 8008648, "step": 13890 }, { "epoch": 2.0695561513255885, "grad_norm": 0.27698326110839844, "learning_rate": 4.827828243464959e-05, "loss": 0.7941, "num_input_tokens_seen": 8011432, "step": 13895 }, { "epoch": 2.0703008638665477, "grad_norm": 0.33808186650276184, "learning_rate": 4.8275911623345816e-05, "loss": 0.7308, "num_input_tokens_seen": 8014696, "step": 13900 }, { "epoch": 2.071045576407507, "grad_norm": 0.43197861313819885, "learning_rate": 4.8273539239150555e-05, "loss": 0.7963, "num_input_tokens_seen": 8017576, "step": 13905 }, { "epoch": 2.071790288948466, "grad_norm": 0.3570777475833893, "learning_rate": 4.827116528222414e-05, "loss": 0.7986, "num_input_tokens_seen": 8020808, "step": 13910 }, { "epoch": 2.0725350014894253, "grad_norm": 0.5468475818634033, "learning_rate": 4.8268789752726993e-05, "loss": 0.8009, "num_input_tokens_seen": 8023528, "step": 13915 }, { "epoch": 2.0732797140303845, "grad_norm": 0.46764740347862244, "learning_rate": 4.826641265081964e-05, "loss": 0.8288, "num_input_tokens_seen": 8026632, "step": 13920 }, { "epoch": 2.0740244265713437, "grad_norm": 0.3248591125011444, "learning_rate": 4.82640339766627e-05, "loss": 0.7825, "num_input_tokens_seen": 8029352, "step": 13925 }, { "epoch": 2.074769139112303, "grad_norm": 0.34623652696609497, "learning_rate": 4.8261653730416945e-05, "loss": 0.7836, "num_input_tokens_seen": 8032424, "step": 13930 }, { "epoch": 2.0755138516532616, "grad_norm": 0.35615500807762146, "learning_rate": 4.8259271912243196e-05, "loss": 0.794, "num_input_tokens_seen": 8035080, "step": 13935 }, { "epoch": 2.076258564194221, "grad_norm": 0.43401941657066345, "learning_rate": 4.8256888522302426e-05, "loss": 0.8404, "num_input_tokens_seen": 8038120, "step": 13940 }, { "epoch": 2.07700327673518, "grad_norm": 0.4867802858352661, "learning_rate": 4.825450356075568e-05, "loss": 0.8168, "num_input_tokens_seen": 8041160, "step": 13945 }, { "epoch": 2.0777479892761392, "grad_norm": 0.4833724796772003, "learning_rate": 4.825211702776412e-05, "loss": 0.85, "num_input_tokens_seen": 8044104, "step": 13950 }, { "epoch": 2.0784927018170984, "grad_norm": 0.3815840184688568, "learning_rate": 4.824972892348904e-05, "loss": 0.8144, "num_input_tokens_seen": 8046728, "step": 13955 }, { "epoch": 2.0792374143580576, "grad_norm": 0.31175726652145386, "learning_rate": 4.8247339248091805e-05, "loss": 0.782, "num_input_tokens_seen": 8049704, "step": 13960 }, { "epoch": 2.079982126899017, "grad_norm": 0.38433346152305603, "learning_rate": 4.824494800173389e-05, "loss": 0.8013, "num_input_tokens_seen": 8052584, "step": 13965 }, { "epoch": 2.080726839439976, "grad_norm": 0.5622491240501404, "learning_rate": 4.824255518457691e-05, "loss": 0.782, "num_input_tokens_seen": 8055304, "step": 13970 }, { "epoch": 2.0814715519809353, "grad_norm": 0.3973809480667114, "learning_rate": 4.824016079678254e-05, "loss": 0.8323, "num_input_tokens_seen": 8058120, "step": 13975 }, { "epoch": 2.0822162645218945, "grad_norm": 0.28096726536750793, "learning_rate": 4.823776483851259e-05, "loss": 0.8107, "num_input_tokens_seen": 8061224, "step": 13980 }, { "epoch": 2.0829609770628537, "grad_norm": 0.34431830048561096, "learning_rate": 4.8235367309928975e-05, "loss": 0.8316, "num_input_tokens_seen": 8064136, "step": 13985 }, { "epoch": 2.083705689603813, "grad_norm": 0.3990403413772583, "learning_rate": 4.82329682111937e-05, "loss": 0.8033, "num_input_tokens_seen": 8066792, "step": 13990 }, { "epoch": 2.084450402144772, "grad_norm": 0.5625796318054199, "learning_rate": 4.82305675424689e-05, "loss": 0.8115, "num_input_tokens_seen": 8070024, "step": 13995 }, { "epoch": 2.0851951146857313, "grad_norm": 0.3447840213775635, "learning_rate": 4.822816530391678e-05, "loss": 0.7919, "num_input_tokens_seen": 8072680, "step": 14000 }, { "epoch": 2.0859398272266905, "grad_norm": 0.42051491141319275, "learning_rate": 4.82257614956997e-05, "loss": 0.7918, "num_input_tokens_seen": 8075848, "step": 14005 }, { "epoch": 2.0866845397676497, "grad_norm": 0.2927780747413635, "learning_rate": 4.8223356117980085e-05, "loss": 0.8241, "num_input_tokens_seen": 8079048, "step": 14010 }, { "epoch": 2.087429252308609, "grad_norm": 0.41071009635925293, "learning_rate": 4.822094917092048e-05, "loss": 0.8378, "num_input_tokens_seen": 8081832, "step": 14015 }, { "epoch": 2.088173964849568, "grad_norm": 0.43377918004989624, "learning_rate": 4.8218540654683544e-05, "loss": 0.8466, "num_input_tokens_seen": 8084584, "step": 14020 }, { "epoch": 2.0889186773905273, "grad_norm": 0.3440784513950348, "learning_rate": 4.821613056943203e-05, "loss": 0.8176, "num_input_tokens_seen": 8087464, "step": 14025 }, { "epoch": 2.0896633899314865, "grad_norm": 0.45843449234962463, "learning_rate": 4.821371891532879e-05, "loss": 0.8166, "num_input_tokens_seen": 8090376, "step": 14030 }, { "epoch": 2.0904081024724457, "grad_norm": 0.3492603600025177, "learning_rate": 4.821130569253682e-05, "loss": 0.8072, "num_input_tokens_seen": 8093192, "step": 14035 }, { "epoch": 2.091152815013405, "grad_norm": 0.338012158870697, "learning_rate": 4.8208890901219174e-05, "loss": 0.8028, "num_input_tokens_seen": 8096296, "step": 14040 }, { "epoch": 2.091897527554364, "grad_norm": 0.3170351982116699, "learning_rate": 4.820647454153905e-05, "loss": 0.7876, "num_input_tokens_seen": 8099144, "step": 14045 }, { "epoch": 2.0926422400953233, "grad_norm": 0.3589346706867218, "learning_rate": 4.820405661365972e-05, "loss": 0.817, "num_input_tokens_seen": 8101960, "step": 14050 }, { "epoch": 2.0933869526362825, "grad_norm": 0.4877576529979706, "learning_rate": 4.8201637117744584e-05, "loss": 0.7862, "num_input_tokens_seen": 8104712, "step": 14055 }, { "epoch": 2.0941316651772417, "grad_norm": 0.34605076909065247, "learning_rate": 4.819921605395714e-05, "loss": 0.8316, "num_input_tokens_seen": 8107368, "step": 14060 }, { "epoch": 2.094876377718201, "grad_norm": 0.31094780564308167, "learning_rate": 4.819679342246101e-05, "loss": 0.8184, "num_input_tokens_seen": 8110440, "step": 14065 }, { "epoch": 2.09562109025916, "grad_norm": 0.2946521043777466, "learning_rate": 4.819436922341988e-05, "loss": 0.7969, "num_input_tokens_seen": 8113448, "step": 14070 }, { "epoch": 2.0963658028001193, "grad_norm": 0.3830040991306305, "learning_rate": 4.819194345699758e-05, "loss": 0.8506, "num_input_tokens_seen": 8116104, "step": 14075 }, { "epoch": 2.0971105153410785, "grad_norm": 0.34509700536727905, "learning_rate": 4.818951612335803e-05, "loss": 0.7924, "num_input_tokens_seen": 8118920, "step": 14080 }, { "epoch": 2.0978552278820377, "grad_norm": 0.5190654397010803, "learning_rate": 4.8187087222665266e-05, "loss": 0.8297, "num_input_tokens_seen": 8121768, "step": 14085 }, { "epoch": 2.098599940422997, "grad_norm": 0.4036533534526825, "learning_rate": 4.818465675508342e-05, "loss": 0.8144, "num_input_tokens_seen": 8124968, "step": 14090 }, { "epoch": 2.099344652963956, "grad_norm": 0.3705919682979584, "learning_rate": 4.818222472077674e-05, "loss": 0.8043, "num_input_tokens_seen": 8127592, "step": 14095 }, { "epoch": 2.1000893655049153, "grad_norm": 0.302200049161911, "learning_rate": 4.817979111990955e-05, "loss": 0.7895, "num_input_tokens_seen": 8130152, "step": 14100 }, { "epoch": 2.1008340780458745, "grad_norm": 0.2932947278022766, "learning_rate": 4.817735595264633e-05, "loss": 0.8052, "num_input_tokens_seen": 8132968, "step": 14105 }, { "epoch": 2.1015787905868333, "grad_norm": 0.327491819858551, "learning_rate": 4.817491921915162e-05, "loss": 0.7966, "num_input_tokens_seen": 8135944, "step": 14110 }, { "epoch": 2.1023235031277925, "grad_norm": 0.3494863212108612, "learning_rate": 4.817248091959009e-05, "loss": 0.8167, "num_input_tokens_seen": 8138952, "step": 14115 }, { "epoch": 2.1030682156687517, "grad_norm": 0.4133553206920624, "learning_rate": 4.817004105412652e-05, "loss": 0.8069, "num_input_tokens_seen": 8141800, "step": 14120 }, { "epoch": 2.103812928209711, "grad_norm": 0.4754062592983246, "learning_rate": 4.8167599622925776e-05, "loss": 0.808, "num_input_tokens_seen": 8144552, "step": 14125 }, { "epoch": 2.10455764075067, "grad_norm": 0.6043684482574463, "learning_rate": 4.816515662615284e-05, "loss": 0.8487, "num_input_tokens_seen": 8147336, "step": 14130 }, { "epoch": 2.1053023532916293, "grad_norm": 0.3909417688846588, "learning_rate": 4.8162712063972805e-05, "loss": 0.8482, "num_input_tokens_seen": 8150152, "step": 14135 }, { "epoch": 2.1060470658325885, "grad_norm": 0.35975292325019836, "learning_rate": 4.816026593655085e-05, "loss": 0.8341, "num_input_tokens_seen": 8153128, "step": 14140 }, { "epoch": 2.1067917783735477, "grad_norm": 0.3918684720993042, "learning_rate": 4.81578182440523e-05, "loss": 0.8141, "num_input_tokens_seen": 8155848, "step": 14145 }, { "epoch": 2.107536490914507, "grad_norm": 0.39362862706184387, "learning_rate": 4.815536898664254e-05, "loss": 0.8123, "num_input_tokens_seen": 8158632, "step": 14150 }, { "epoch": 2.108281203455466, "grad_norm": 0.4571303427219391, "learning_rate": 4.815291816448709e-05, "loss": 0.8098, "num_input_tokens_seen": 8161480, "step": 14155 }, { "epoch": 2.1090259159964253, "grad_norm": 0.33612340688705444, "learning_rate": 4.815046577775156e-05, "loss": 0.8132, "num_input_tokens_seen": 8164424, "step": 14160 }, { "epoch": 2.1097706285373845, "grad_norm": 0.364601194858551, "learning_rate": 4.8148011826601676e-05, "loss": 0.7836, "num_input_tokens_seen": 8167464, "step": 14165 }, { "epoch": 2.1105153410783437, "grad_norm": 0.492721289396286, "learning_rate": 4.814555631120327e-05, "loss": 0.7818, "num_input_tokens_seen": 8170472, "step": 14170 }, { "epoch": 2.111260053619303, "grad_norm": 0.30162692070007324, "learning_rate": 4.814309923172227e-05, "loss": 0.7757, "num_input_tokens_seen": 8173480, "step": 14175 }, { "epoch": 2.112004766160262, "grad_norm": 0.3399356007575989, "learning_rate": 4.8140640588324705e-05, "loss": 0.8099, "num_input_tokens_seen": 8176232, "step": 14180 }, { "epoch": 2.1127494787012213, "grad_norm": 0.3757372796535492, "learning_rate": 4.8138180381176744e-05, "loss": 0.8618, "num_input_tokens_seen": 8179144, "step": 14185 }, { "epoch": 2.1134941912421805, "grad_norm": 0.45025119185447693, "learning_rate": 4.813571861044463e-05, "loss": 0.802, "num_input_tokens_seen": 8182056, "step": 14190 }, { "epoch": 2.1142389037831397, "grad_norm": 0.34687966108322144, "learning_rate": 4.81332552762947e-05, "loss": 0.8053, "num_input_tokens_seen": 8185128, "step": 14195 }, { "epoch": 2.114983616324099, "grad_norm": 0.4075911343097687, "learning_rate": 4.813079037889344e-05, "loss": 0.8146, "num_input_tokens_seen": 8188168, "step": 14200 }, { "epoch": 2.115728328865058, "grad_norm": 0.3598785698413849, "learning_rate": 4.812832391840741e-05, "loss": 0.7955, "num_input_tokens_seen": 8191112, "step": 14205 }, { "epoch": 2.1164730414060173, "grad_norm": 0.4366471767425537, "learning_rate": 4.812585589500328e-05, "loss": 0.8055, "num_input_tokens_seen": 8193960, "step": 14210 }, { "epoch": 2.1172177539469765, "grad_norm": 0.28670352697372437, "learning_rate": 4.812338630884783e-05, "loss": 0.7893, "num_input_tokens_seen": 8196904, "step": 14215 }, { "epoch": 2.1179624664879357, "grad_norm": 0.5244358777999878, "learning_rate": 4.812091516010795e-05, "loss": 0.8324, "num_input_tokens_seen": 8199656, "step": 14220 }, { "epoch": 2.118707179028895, "grad_norm": 0.4514823257923126, "learning_rate": 4.811844244895063e-05, "loss": 0.804, "num_input_tokens_seen": 8202568, "step": 14225 }, { "epoch": 2.119451891569854, "grad_norm": 0.3888646364212036, "learning_rate": 4.811596817554296e-05, "loss": 0.8034, "num_input_tokens_seen": 8205416, "step": 14230 }, { "epoch": 2.1201966041108133, "grad_norm": 0.3612498939037323, "learning_rate": 4.8113492340052135e-05, "loss": 0.7588, "num_input_tokens_seen": 8208168, "step": 14235 }, { "epoch": 2.1209413166517725, "grad_norm": 0.3139326274394989, "learning_rate": 4.8111014942645476e-05, "loss": 0.8318, "num_input_tokens_seen": 8210984, "step": 14240 }, { "epoch": 2.1216860291927317, "grad_norm": 0.381386935710907, "learning_rate": 4.8108535983490386e-05, "loss": 0.8055, "num_input_tokens_seen": 8213992, "step": 14245 }, { "epoch": 2.122430741733691, "grad_norm": 0.3751428723335266, "learning_rate": 4.8106055462754394e-05, "loss": 0.8166, "num_input_tokens_seen": 8217096, "step": 14250 }, { "epoch": 2.12317545427465, "grad_norm": 0.39854416251182556, "learning_rate": 4.810357338060512e-05, "loss": 0.7883, "num_input_tokens_seen": 8220040, "step": 14255 }, { "epoch": 2.1239201668156094, "grad_norm": 0.40870392322540283, "learning_rate": 4.810108973721028e-05, "loss": 0.794, "num_input_tokens_seen": 8222984, "step": 14260 }, { "epoch": 2.1246648793565686, "grad_norm": 0.41318079829216003, "learning_rate": 4.809860453273772e-05, "loss": 0.8379, "num_input_tokens_seen": 8225864, "step": 14265 }, { "epoch": 2.1254095918975278, "grad_norm": 0.44339269399642944, "learning_rate": 4.809611776735538e-05, "loss": 0.8125, "num_input_tokens_seen": 8228936, "step": 14270 }, { "epoch": 2.1261543044384865, "grad_norm": 0.346963495016098, "learning_rate": 4.809362944123129e-05, "loss": 0.8278, "num_input_tokens_seen": 8231592, "step": 14275 }, { "epoch": 2.126899016979446, "grad_norm": 0.4930209517478943, "learning_rate": 4.809113955453363e-05, "loss": 0.7811, "num_input_tokens_seen": 8234312, "step": 14280 }, { "epoch": 2.127643729520405, "grad_norm": 0.36866495013237, "learning_rate": 4.8088648107430636e-05, "loss": 0.8004, "num_input_tokens_seen": 8237096, "step": 14285 }, { "epoch": 2.128388442061364, "grad_norm": 0.5613756775856018, "learning_rate": 4.8086155100090676e-05, "loss": 0.8402, "num_input_tokens_seen": 8239848, "step": 14290 }, { "epoch": 2.1291331546023233, "grad_norm": 0.34938064217567444, "learning_rate": 4.8083660532682214e-05, "loss": 0.787, "num_input_tokens_seen": 8242664, "step": 14295 }, { "epoch": 2.1298778671432825, "grad_norm": 0.37698331475257874, "learning_rate": 4.8081164405373825e-05, "loss": 0.8025, "num_input_tokens_seen": 8245704, "step": 14300 }, { "epoch": 2.1306225796842417, "grad_norm": 0.3781338334083557, "learning_rate": 4.807866671833418e-05, "loss": 0.7961, "num_input_tokens_seen": 8248424, "step": 14305 }, { "epoch": 2.131367292225201, "grad_norm": 0.3952782452106476, "learning_rate": 4.807616747173208e-05, "loss": 0.825, "num_input_tokens_seen": 8251400, "step": 14310 }, { "epoch": 2.13211200476616, "grad_norm": 0.41968753933906555, "learning_rate": 4.8073666665736394e-05, "loss": 0.802, "num_input_tokens_seen": 8254472, "step": 14315 }, { "epoch": 2.1328567173071193, "grad_norm": 0.5877001881599426, "learning_rate": 4.807116430051614e-05, "loss": 0.8374, "num_input_tokens_seen": 8257256, "step": 14320 }, { "epoch": 2.1336014298480785, "grad_norm": 0.4306257665157318, "learning_rate": 4.806866037624039e-05, "loss": 0.7847, "num_input_tokens_seen": 8260392, "step": 14325 }, { "epoch": 2.1343461423890377, "grad_norm": 0.27682504057884216, "learning_rate": 4.806615489307836e-05, "loss": 0.7656, "num_input_tokens_seen": 8263208, "step": 14330 }, { "epoch": 2.135090854929997, "grad_norm": 0.4591606855392456, "learning_rate": 4.806364785119937e-05, "loss": 0.7981, "num_input_tokens_seen": 8265960, "step": 14335 }, { "epoch": 2.135835567470956, "grad_norm": 0.33634305000305176, "learning_rate": 4.8061139250772825e-05, "loss": 0.784, "num_input_tokens_seen": 8268872, "step": 14340 }, { "epoch": 2.1365802800119154, "grad_norm": 0.3526039123535156, "learning_rate": 4.805862909196825e-05, "loss": 0.8136, "num_input_tokens_seen": 8271624, "step": 14345 }, { "epoch": 2.1373249925528746, "grad_norm": 0.4886874258518219, "learning_rate": 4.805611737495527e-05, "loss": 0.8404, "num_input_tokens_seen": 8274536, "step": 14350 }, { "epoch": 2.1380697050938338, "grad_norm": 0.40657031536102295, "learning_rate": 4.8053604099903614e-05, "loss": 0.7899, "num_input_tokens_seen": 8277544, "step": 14355 }, { "epoch": 2.138814417634793, "grad_norm": 0.3771660327911377, "learning_rate": 4.8051089266983126e-05, "loss": 0.8549, "num_input_tokens_seen": 8280712, "step": 14360 }, { "epoch": 2.139559130175752, "grad_norm": 0.4090668261051178, "learning_rate": 4.804857287636375e-05, "loss": 0.7924, "num_input_tokens_seen": 8283752, "step": 14365 }, { "epoch": 2.1403038427167114, "grad_norm": 0.31973201036453247, "learning_rate": 4.804605492821552e-05, "loss": 0.8203, "num_input_tokens_seen": 8286536, "step": 14370 }, { "epoch": 2.1410485552576706, "grad_norm": 0.41295769810676575, "learning_rate": 4.80435354227086e-05, "loss": 0.7746, "num_input_tokens_seen": 8289512, "step": 14375 }, { "epoch": 2.1417932677986298, "grad_norm": 0.3895176351070404, "learning_rate": 4.8041014360013236e-05, "loss": 0.8015, "num_input_tokens_seen": 8292360, "step": 14380 }, { "epoch": 2.142537980339589, "grad_norm": 0.38226354122161865, "learning_rate": 4.803849174029981e-05, "loss": 0.7791, "num_input_tokens_seen": 8295048, "step": 14385 }, { "epoch": 2.143282692880548, "grad_norm": 0.32802218198776245, "learning_rate": 4.803596756373877e-05, "loss": 0.8157, "num_input_tokens_seen": 8298088, "step": 14390 }, { "epoch": 2.1440274054215074, "grad_norm": 0.4868684709072113, "learning_rate": 4.8033441830500706e-05, "loss": 0.833, "num_input_tokens_seen": 8301128, "step": 14395 }, { "epoch": 2.1447721179624666, "grad_norm": 0.3944070041179657, "learning_rate": 4.803091454075629e-05, "loss": 0.8365, "num_input_tokens_seen": 8304072, "step": 14400 }, { "epoch": 2.145516830503426, "grad_norm": 0.5317271947860718, "learning_rate": 4.8028385694676306e-05, "loss": 0.7765, "num_input_tokens_seen": 8307016, "step": 14405 }, { "epoch": 2.146261543044385, "grad_norm": 0.3747187554836273, "learning_rate": 4.802585529243164e-05, "loss": 0.8131, "num_input_tokens_seen": 8309416, "step": 14410 }, { "epoch": 2.147006255585344, "grad_norm": 0.2947392165660858, "learning_rate": 4.80233233341933e-05, "loss": 0.8424, "num_input_tokens_seen": 8312552, "step": 14415 }, { "epoch": 2.1477509681263034, "grad_norm": 0.4375716745853424, "learning_rate": 4.802078982013236e-05, "loss": 0.8038, "num_input_tokens_seen": 8315176, "step": 14420 }, { "epoch": 2.1484956806672626, "grad_norm": 0.29694437980651855, "learning_rate": 4.801825475042005e-05, "loss": 0.7951, "num_input_tokens_seen": 8318056, "step": 14425 }, { "epoch": 2.149240393208222, "grad_norm": 0.35236528515815735, "learning_rate": 4.801571812522767e-05, "loss": 0.7933, "num_input_tokens_seen": 8320840, "step": 14430 }, { "epoch": 2.149985105749181, "grad_norm": 0.31149429082870483, "learning_rate": 4.801317994472663e-05, "loss": 0.777, "num_input_tokens_seen": 8323688, "step": 14435 }, { "epoch": 2.15072981829014, "grad_norm": 0.36425361037254333, "learning_rate": 4.801064020908845e-05, "loss": 0.802, "num_input_tokens_seen": 8326664, "step": 14440 }, { "epoch": 2.1514745308310994, "grad_norm": 0.32278701663017273, "learning_rate": 4.800809891848477e-05, "loss": 0.7848, "num_input_tokens_seen": 8329384, "step": 14445 }, { "epoch": 2.152219243372058, "grad_norm": 0.3663560152053833, "learning_rate": 4.80055560730873e-05, "loss": 0.8081, "num_input_tokens_seen": 8332296, "step": 14450 }, { "epoch": 2.1529639559130174, "grad_norm": 0.35149946808815, "learning_rate": 4.800301167306789e-05, "loss": 0.8326, "num_input_tokens_seen": 8335272, "step": 14455 }, { "epoch": 2.1537086684539766, "grad_norm": 0.40398895740509033, "learning_rate": 4.800046571859847e-05, "loss": 0.8122, "num_input_tokens_seen": 8338344, "step": 14460 }, { "epoch": 2.1544533809949358, "grad_norm": 0.4736277461051941, "learning_rate": 4.79979182098511e-05, "loss": 0.8249, "num_input_tokens_seen": 8341192, "step": 14465 }, { "epoch": 2.155198093535895, "grad_norm": 0.44686684012413025, "learning_rate": 4.7995369146997906e-05, "loss": 0.7882, "num_input_tokens_seen": 8344200, "step": 14470 }, { "epoch": 2.155942806076854, "grad_norm": 0.27475178241729736, "learning_rate": 4.7992818530211164e-05, "loss": 0.821, "num_input_tokens_seen": 8347176, "step": 14475 }, { "epoch": 2.1566875186178134, "grad_norm": 0.3882451355457306, "learning_rate": 4.799026635966323e-05, "loss": 0.8165, "num_input_tokens_seen": 8350120, "step": 14480 }, { "epoch": 2.1574322311587726, "grad_norm": 0.31035688519477844, "learning_rate": 4.798771263552656e-05, "loss": 0.7982, "num_input_tokens_seen": 8353032, "step": 14485 }, { "epoch": 2.158176943699732, "grad_norm": 0.3338433504104614, "learning_rate": 4.798515735797374e-05, "loss": 0.804, "num_input_tokens_seen": 8356776, "step": 14490 }, { "epoch": 2.158921656240691, "grad_norm": 0.481047123670578, "learning_rate": 4.7982600527177427e-05, "loss": 0.812, "num_input_tokens_seen": 8359848, "step": 14495 }, { "epoch": 2.15966636878165, "grad_norm": 0.2915627658367157, "learning_rate": 4.798004214331042e-05, "loss": 0.8056, "num_input_tokens_seen": 8362536, "step": 14500 }, { "epoch": 2.1604110813226094, "grad_norm": 0.3556881248950958, "learning_rate": 4.7977482206545586e-05, "loss": 0.7988, "num_input_tokens_seen": 8365160, "step": 14505 }, { "epoch": 2.1611557938635686, "grad_norm": 0.39205503463745117, "learning_rate": 4.797492071705593e-05, "loss": 0.8032, "num_input_tokens_seen": 8368168, "step": 14510 }, { "epoch": 2.161900506404528, "grad_norm": 0.30708250403404236, "learning_rate": 4.7972357675014546e-05, "loss": 0.7609, "num_input_tokens_seen": 8370888, "step": 14515 }, { "epoch": 2.162645218945487, "grad_norm": 0.4452507495880127, "learning_rate": 4.796979308059462e-05, "loss": 0.8372, "num_input_tokens_seen": 8373480, "step": 14520 }, { "epoch": 2.163389931486446, "grad_norm": 0.42255866527557373, "learning_rate": 4.796722693396947e-05, "loss": 0.8171, "num_input_tokens_seen": 8376552, "step": 14525 }, { "epoch": 2.1641346440274054, "grad_norm": 0.368507444858551, "learning_rate": 4.79646592353125e-05, "loss": 0.7945, "num_input_tokens_seen": 8379464, "step": 14530 }, { "epoch": 2.1648793565683646, "grad_norm": 0.4363814890384674, "learning_rate": 4.7962089984797235e-05, "loss": 0.791, "num_input_tokens_seen": 8382440, "step": 14535 }, { "epoch": 2.165624069109324, "grad_norm": 0.3549022972583771, "learning_rate": 4.795951918259727e-05, "loss": 0.8104, "num_input_tokens_seen": 8385160, "step": 14540 }, { "epoch": 2.166368781650283, "grad_norm": 0.3426201641559601, "learning_rate": 4.795694682888635e-05, "loss": 0.8534, "num_input_tokens_seen": 8387848, "step": 14545 }, { "epoch": 2.167113494191242, "grad_norm": 0.4106147885322571, "learning_rate": 4.795437292383831e-05, "loss": 0.8286, "num_input_tokens_seen": 8390600, "step": 14550 }, { "epoch": 2.1678582067322014, "grad_norm": 0.3965390622615814, "learning_rate": 4.7951797467627065e-05, "loss": 0.8413, "num_input_tokens_seen": 8393480, "step": 14555 }, { "epoch": 2.1686029192731606, "grad_norm": 0.34345269203186035, "learning_rate": 4.7949220460426666e-05, "loss": 0.8141, "num_input_tokens_seen": 8396200, "step": 14560 }, { "epoch": 2.16934763181412, "grad_norm": 0.3592470586299896, "learning_rate": 4.794664190241125e-05, "loss": 0.7871, "num_input_tokens_seen": 8399208, "step": 14565 }, { "epoch": 2.170092344355079, "grad_norm": 0.3409104347229004, "learning_rate": 4.794406179375507e-05, "loss": 0.8261, "num_input_tokens_seen": 8401800, "step": 14570 }, { "epoch": 2.1708370568960382, "grad_norm": 0.26963189244270325, "learning_rate": 4.794148013463248e-05, "loss": 0.7865, "num_input_tokens_seen": 8404712, "step": 14575 }, { "epoch": 2.1715817694369974, "grad_norm": 0.32412075996398926, "learning_rate": 4.793889692521792e-05, "loss": 0.8069, "num_input_tokens_seen": 8407432, "step": 14580 }, { "epoch": 2.1723264819779566, "grad_norm": 0.37541481852531433, "learning_rate": 4.793631216568599e-05, "loss": 0.797, "num_input_tokens_seen": 8410312, "step": 14585 }, { "epoch": 2.173071194518916, "grad_norm": 0.42470502853393555, "learning_rate": 4.793372585621133e-05, "loss": 0.7749, "num_input_tokens_seen": 8413320, "step": 14590 }, { "epoch": 2.173815907059875, "grad_norm": 0.3359730541706085, "learning_rate": 4.793113799696871e-05, "loss": 0.8017, "num_input_tokens_seen": 8416168, "step": 14595 }, { "epoch": 2.1745606196008342, "grad_norm": 0.42205506563186646, "learning_rate": 4.792854858813303e-05, "loss": 0.8045, "num_input_tokens_seen": 8419240, "step": 14600 }, { "epoch": 2.1753053321417934, "grad_norm": 0.3675540089607239, "learning_rate": 4.792595762987924e-05, "loss": 0.8171, "num_input_tokens_seen": 8422440, "step": 14605 }, { "epoch": 2.1760500446827526, "grad_norm": 0.4634401798248291, "learning_rate": 4.792336512238246e-05, "loss": 0.8458, "num_input_tokens_seen": 8425032, "step": 14610 }, { "epoch": 2.176794757223712, "grad_norm": 0.43454650044441223, "learning_rate": 4.7920771065817846e-05, "loss": 0.8178, "num_input_tokens_seen": 8427816, "step": 14615 }, { "epoch": 2.177539469764671, "grad_norm": 0.3200374245643616, "learning_rate": 4.791817546036072e-05, "loss": 0.8001, "num_input_tokens_seen": 8430632, "step": 14620 }, { "epoch": 2.17828418230563, "grad_norm": 0.38694676756858826, "learning_rate": 4.7915578306186485e-05, "loss": 0.8424, "num_input_tokens_seen": 8433384, "step": 14625 }, { "epoch": 2.179028894846589, "grad_norm": 0.3061578571796417, "learning_rate": 4.791297960347063e-05, "loss": 0.7764, "num_input_tokens_seen": 8435976, "step": 14630 }, { "epoch": 2.179773607387548, "grad_norm": 0.4347259998321533, "learning_rate": 4.791037935238877e-05, "loss": 0.8142, "num_input_tokens_seen": 8438920, "step": 14635 }, { "epoch": 2.1805183199285074, "grad_norm": 0.3713735342025757, "learning_rate": 4.790777755311662e-05, "loss": 0.8069, "num_input_tokens_seen": 8441640, "step": 14640 }, { "epoch": 2.1812630324694666, "grad_norm": 0.41740527749061584, "learning_rate": 4.790517420583e-05, "loss": 0.7983, "num_input_tokens_seen": 8444488, "step": 14645 }, { "epoch": 2.182007745010426, "grad_norm": 0.3529587686061859, "learning_rate": 4.790256931070483e-05, "loss": 0.8225, "num_input_tokens_seen": 8447240, "step": 14650 }, { "epoch": 2.182752457551385, "grad_norm": 0.3695366680622101, "learning_rate": 4.789996286791715e-05, "loss": 0.8183, "num_input_tokens_seen": 8450088, "step": 14655 }, { "epoch": 2.1834971700923442, "grad_norm": 0.2774193584918976, "learning_rate": 4.789735487764307e-05, "loss": 0.7783, "num_input_tokens_seen": 8452776, "step": 14660 }, { "epoch": 2.1842418826333034, "grad_norm": 0.34960559010505676, "learning_rate": 4.789474534005885e-05, "loss": 0.8298, "num_input_tokens_seen": 8455656, "step": 14665 }, { "epoch": 2.1849865951742626, "grad_norm": 0.3066423237323761, "learning_rate": 4.789213425534082e-05, "loss": 0.7924, "num_input_tokens_seen": 8458312, "step": 14670 }, { "epoch": 2.185731307715222, "grad_norm": 0.34645017981529236, "learning_rate": 4.788952162366543e-05, "loss": 0.7572, "num_input_tokens_seen": 8461672, "step": 14675 }, { "epoch": 2.186476020256181, "grad_norm": 0.5215762257575989, "learning_rate": 4.7886907445209234e-05, "loss": 0.8483, "num_input_tokens_seen": 8464584, "step": 14680 }, { "epoch": 2.1872207327971402, "grad_norm": 0.35524702072143555, "learning_rate": 4.7884291720148876e-05, "loss": 0.8034, "num_input_tokens_seen": 8467336, "step": 14685 }, { "epoch": 2.1879654453380994, "grad_norm": 0.5086907148361206, "learning_rate": 4.7881674448661136e-05, "loss": 0.8461, "num_input_tokens_seen": 8470280, "step": 14690 }, { "epoch": 2.1887101578790586, "grad_norm": 0.2981397211551666, "learning_rate": 4.7879055630922856e-05, "loss": 0.7865, "num_input_tokens_seen": 8473384, "step": 14695 }, { "epoch": 2.189454870420018, "grad_norm": 0.4237830638885498, "learning_rate": 4.7876435267111024e-05, "loss": 0.833, "num_input_tokens_seen": 8476424, "step": 14700 }, { "epoch": 2.190199582960977, "grad_norm": 0.4800131618976593, "learning_rate": 4.7873813357402704e-05, "loss": 0.803, "num_input_tokens_seen": 8479176, "step": 14705 }, { "epoch": 2.1909442955019363, "grad_norm": 0.28280165791511536, "learning_rate": 4.7871189901975075e-05, "loss": 0.7728, "num_input_tokens_seen": 8482184, "step": 14710 }, { "epoch": 2.1916890080428955, "grad_norm": 0.40413087606430054, "learning_rate": 4.786856490100542e-05, "loss": 0.8368, "num_input_tokens_seen": 8485256, "step": 14715 }, { "epoch": 2.1924337205838547, "grad_norm": 0.44055238366127014, "learning_rate": 4.786593835467112e-05, "loss": 0.8375, "num_input_tokens_seen": 8488264, "step": 14720 }, { "epoch": 2.193178433124814, "grad_norm": 0.31279975175857544, "learning_rate": 4.786331026314968e-05, "loss": 0.7819, "num_input_tokens_seen": 8491144, "step": 14725 }, { "epoch": 2.193923145665773, "grad_norm": 0.4308384954929352, "learning_rate": 4.7860680626618684e-05, "loss": 0.8369, "num_input_tokens_seen": 8493960, "step": 14730 }, { "epoch": 2.1946678582067323, "grad_norm": 0.39711081981658936, "learning_rate": 4.7858049445255834e-05, "loss": 0.8381, "num_input_tokens_seen": 8496840, "step": 14735 }, { "epoch": 2.1954125707476915, "grad_norm": 0.36694154143333435, "learning_rate": 4.7855416719238945e-05, "loss": 0.7822, "num_input_tokens_seen": 8499592, "step": 14740 }, { "epoch": 2.1961572832886507, "grad_norm": 0.38337013125419617, "learning_rate": 4.78527824487459e-05, "loss": 0.8293, "num_input_tokens_seen": 8502632, "step": 14745 }, { "epoch": 2.19690199582961, "grad_norm": 0.40678781270980835, "learning_rate": 4.785014663395475e-05, "loss": 0.8022, "num_input_tokens_seen": 8505800, "step": 14750 }, { "epoch": 2.197646708370569, "grad_norm": 0.2585204243659973, "learning_rate": 4.784750927504358e-05, "loss": 0.8081, "num_input_tokens_seen": 8508552, "step": 14755 }, { "epoch": 2.1983914209115283, "grad_norm": 0.3892938494682312, "learning_rate": 4.784487037219063e-05, "loss": 0.8106, "num_input_tokens_seen": 8511464, "step": 14760 }, { "epoch": 2.1991361334524875, "grad_norm": 0.37765541672706604, "learning_rate": 4.784222992557422e-05, "loss": 0.821, "num_input_tokens_seen": 8513992, "step": 14765 }, { "epoch": 2.1998808459934467, "grad_norm": 0.3436562120914459, "learning_rate": 4.783958793537278e-05, "loss": 0.8238, "num_input_tokens_seen": 8516680, "step": 14770 }, { "epoch": 2.200625558534406, "grad_norm": 0.42639416456222534, "learning_rate": 4.783694440176485e-05, "loss": 0.8075, "num_input_tokens_seen": 8519624, "step": 14775 }, { "epoch": 2.201370271075365, "grad_norm": 0.4250396490097046, "learning_rate": 4.7834299324929056e-05, "loss": 0.8344, "num_input_tokens_seen": 8522536, "step": 14780 }, { "epoch": 2.2021149836163243, "grad_norm": 0.3127453029155731, "learning_rate": 4.7831652705044164e-05, "loss": 0.7873, "num_input_tokens_seen": 8525512, "step": 14785 }, { "epoch": 2.202859696157283, "grad_norm": 0.3831392228603363, "learning_rate": 4.7829004542289e-05, "loss": 0.8259, "num_input_tokens_seen": 8528392, "step": 14790 }, { "epoch": 2.2036044086982427, "grad_norm": 0.3777461349964142, "learning_rate": 4.7826354836842525e-05, "loss": 0.8139, "num_input_tokens_seen": 8531432, "step": 14795 }, { "epoch": 2.2043491212392015, "grad_norm": 0.44610071182250977, "learning_rate": 4.7823703588883796e-05, "loss": 0.7826, "num_input_tokens_seen": 8534440, "step": 14800 }, { "epoch": 2.2050938337801607, "grad_norm": 0.41725367307662964, "learning_rate": 4.782105079859198e-05, "loss": 0.8038, "num_input_tokens_seen": 8537384, "step": 14805 }, { "epoch": 2.20583854632112, "grad_norm": 0.21974727511405945, "learning_rate": 4.7818396466146326e-05, "loss": 0.8093, "num_input_tokens_seen": 8540200, "step": 14810 }, { "epoch": 2.206583258862079, "grad_norm": 0.42866015434265137, "learning_rate": 4.781574059172621e-05, "loss": 0.8154, "num_input_tokens_seen": 8542888, "step": 14815 }, { "epoch": 2.2073279714030383, "grad_norm": 0.3987097442150116, "learning_rate": 4.781308317551112e-05, "loss": 0.8121, "num_input_tokens_seen": 8545768, "step": 14820 }, { "epoch": 2.2080726839439975, "grad_norm": 0.46036890149116516, "learning_rate": 4.781042421768061e-05, "loss": 0.8098, "num_input_tokens_seen": 8548552, "step": 14825 }, { "epoch": 2.2088173964849567, "grad_norm": 0.3428369462490082, "learning_rate": 4.7807763718414374e-05, "loss": 0.8098, "num_input_tokens_seen": 8551432, "step": 14830 }, { "epoch": 2.209562109025916, "grad_norm": 0.38306429982185364, "learning_rate": 4.7805101677892194e-05, "loss": 0.8059, "num_input_tokens_seen": 8554408, "step": 14835 }, { "epoch": 2.210306821566875, "grad_norm": 0.34974992275238037, "learning_rate": 4.7802438096293964e-05, "loss": 0.8128, "num_input_tokens_seen": 8557128, "step": 14840 }, { "epoch": 2.2110515341078343, "grad_norm": 0.3984963893890381, "learning_rate": 4.7799772973799674e-05, "loss": 0.8171, "num_input_tokens_seen": 8560040, "step": 14845 }, { "epoch": 2.2117962466487935, "grad_norm": 0.33570268750190735, "learning_rate": 4.7797106310589424e-05, "loss": 0.8173, "num_input_tokens_seen": 8562824, "step": 14850 }, { "epoch": 2.2125409591897527, "grad_norm": 0.31624025106430054, "learning_rate": 4.779443810684341e-05, "loss": 0.8156, "num_input_tokens_seen": 8565960, "step": 14855 }, { "epoch": 2.213285671730712, "grad_norm": 0.4073953330516815, "learning_rate": 4.779176836274195e-05, "loss": 0.8115, "num_input_tokens_seen": 8568808, "step": 14860 }, { "epoch": 2.214030384271671, "grad_norm": 0.3744026720523834, "learning_rate": 4.778909707846545e-05, "loss": 0.7963, "num_input_tokens_seen": 8571496, "step": 14865 }, { "epoch": 2.2147750968126303, "grad_norm": 0.355959415435791, "learning_rate": 4.778642425419442e-05, "loss": 0.8197, "num_input_tokens_seen": 8574504, "step": 14870 }, { "epoch": 2.2155198093535895, "grad_norm": 0.3157121241092682, "learning_rate": 4.778374989010949e-05, "loss": 0.802, "num_input_tokens_seen": 8577256, "step": 14875 }, { "epoch": 2.2162645218945487, "grad_norm": 0.37391746044158936, "learning_rate": 4.778107398639136e-05, "loss": 0.8157, "num_input_tokens_seen": 8580200, "step": 14880 }, { "epoch": 2.217009234435508, "grad_norm": 0.26074549555778503, "learning_rate": 4.777839654322088e-05, "loss": 0.7941, "num_input_tokens_seen": 8583304, "step": 14885 }, { "epoch": 2.217753946976467, "grad_norm": 0.4246900975704193, "learning_rate": 4.777571756077897e-05, "loss": 0.8044, "num_input_tokens_seen": 8586248, "step": 14890 }, { "epoch": 2.2184986595174263, "grad_norm": 0.3321847915649414, "learning_rate": 4.777303703924667e-05, "loss": 0.8352, "num_input_tokens_seen": 8588968, "step": 14895 }, { "epoch": 2.2192433720583855, "grad_norm": 0.3721068203449249, "learning_rate": 4.777035497880511e-05, "loss": 0.8253, "num_input_tokens_seen": 8592008, "step": 14900 }, { "epoch": 2.2199880845993447, "grad_norm": 0.40803948044776917, "learning_rate": 4.776767137963554e-05, "loss": 0.8192, "num_input_tokens_seen": 8595144, "step": 14905 }, { "epoch": 2.220732797140304, "grad_norm": 0.36098867654800415, "learning_rate": 4.776498624191931e-05, "loss": 0.8044, "num_input_tokens_seen": 8598408, "step": 14910 }, { "epoch": 2.221477509681263, "grad_norm": 0.36468151211738586, "learning_rate": 4.7762299565837855e-05, "loss": 0.7737, "num_input_tokens_seen": 8601352, "step": 14915 }, { "epoch": 2.2222222222222223, "grad_norm": 0.32007282972335815, "learning_rate": 4.775961135157275e-05, "loss": 0.818, "num_input_tokens_seen": 8604456, "step": 14920 }, { "epoch": 2.2229669347631815, "grad_norm": 0.44696781039237976, "learning_rate": 4.775692159930564e-05, "loss": 0.8204, "num_input_tokens_seen": 8607560, "step": 14925 }, { "epoch": 2.2237116473041407, "grad_norm": 0.35534870624542236, "learning_rate": 4.775423030921828e-05, "loss": 0.8063, "num_input_tokens_seen": 8610440, "step": 14930 }, { "epoch": 2.2244563598451, "grad_norm": 0.40866518020629883, "learning_rate": 4.7751537481492565e-05, "loss": 0.7902, "num_input_tokens_seen": 8613384, "step": 14935 }, { "epoch": 2.225201072386059, "grad_norm": 0.3642534017562866, "learning_rate": 4.7748843116310434e-05, "loss": 0.84, "num_input_tokens_seen": 8616296, "step": 14940 }, { "epoch": 2.2259457849270183, "grad_norm": 0.338657408952713, "learning_rate": 4.774614721385399e-05, "loss": 0.7729, "num_input_tokens_seen": 8619368, "step": 14945 }, { "epoch": 2.2266904974679775, "grad_norm": 0.35217219591140747, "learning_rate": 4.7743449774305386e-05, "loss": 0.8454, "num_input_tokens_seen": 8622248, "step": 14950 }, { "epoch": 2.2274352100089367, "grad_norm": 0.3994382619857788, "learning_rate": 4.774075079784692e-05, "loss": 0.8195, "num_input_tokens_seen": 8625128, "step": 14955 }, { "epoch": 2.228179922549896, "grad_norm": 0.41018080711364746, "learning_rate": 4.7738050284660966e-05, "loss": 0.8477, "num_input_tokens_seen": 8628168, "step": 14960 }, { "epoch": 2.2289246350908547, "grad_norm": 0.4142216444015503, "learning_rate": 4.7735348234930024e-05, "loss": 0.8044, "num_input_tokens_seen": 8631208, "step": 14965 }, { "epoch": 2.2296693476318143, "grad_norm": 0.348985880613327, "learning_rate": 4.773264464883669e-05, "loss": 0.8118, "num_input_tokens_seen": 8634152, "step": 14970 }, { "epoch": 2.230414060172773, "grad_norm": 0.2947421371936798, "learning_rate": 4.772993952656364e-05, "loss": 0.7574, "num_input_tokens_seen": 8637384, "step": 14975 }, { "epoch": 2.2311587727137323, "grad_norm": 0.2623814344406128, "learning_rate": 4.7727232868293705e-05, "loss": 0.7834, "num_input_tokens_seen": 8640424, "step": 14980 }, { "epoch": 2.2319034852546915, "grad_norm": 0.22233721613883972, "learning_rate": 4.7724524674209765e-05, "loss": 0.7864, "num_input_tokens_seen": 8643240, "step": 14985 }, { "epoch": 2.2326481977956507, "grad_norm": 0.5257058143615723, "learning_rate": 4.7721814944494834e-05, "loss": 0.8005, "num_input_tokens_seen": 8646024, "step": 14990 }, { "epoch": 2.23339291033661, "grad_norm": 0.5198317766189575, "learning_rate": 4.771910367933204e-05, "loss": 0.8231, "num_input_tokens_seen": 8648840, "step": 14995 }, { "epoch": 2.234137622877569, "grad_norm": 0.3070354461669922, "learning_rate": 4.771639087890459e-05, "loss": 0.7755, "num_input_tokens_seen": 8651688, "step": 15000 }, { "epoch": 2.2348823354185283, "grad_norm": 0.36359527707099915, "learning_rate": 4.771367654339579e-05, "loss": 0.7915, "num_input_tokens_seen": 8654344, "step": 15005 }, { "epoch": 2.2356270479594875, "grad_norm": 0.3946087956428528, "learning_rate": 4.771096067298909e-05, "loss": 0.8006, "num_input_tokens_seen": 8657128, "step": 15010 }, { "epoch": 2.2363717605004467, "grad_norm": 0.28945356607437134, "learning_rate": 4.7708243267868e-05, "loss": 0.7639, "num_input_tokens_seen": 8659912, "step": 15015 }, { "epoch": 2.237116473041406, "grad_norm": 0.5179906487464905, "learning_rate": 4.770552432821615e-05, "loss": 0.8363, "num_input_tokens_seen": 8663016, "step": 15020 }, { "epoch": 2.237861185582365, "grad_norm": 0.4678988754749298, "learning_rate": 4.770280385421728e-05, "loss": 0.8127, "num_input_tokens_seen": 8666280, "step": 15025 }, { "epoch": 2.2386058981233243, "grad_norm": 0.3264045715332031, "learning_rate": 4.7700081846055236e-05, "loss": 0.8514, "num_input_tokens_seen": 8669224, "step": 15030 }, { "epoch": 2.2393506106642835, "grad_norm": 0.3710739016532898, "learning_rate": 4.7697358303913944e-05, "loss": 0.7987, "num_input_tokens_seen": 8671912, "step": 15035 }, { "epoch": 2.2400953232052427, "grad_norm": 0.32021328806877136, "learning_rate": 4.769463322797746e-05, "loss": 0.8199, "num_input_tokens_seen": 8674984, "step": 15040 }, { "epoch": 2.240840035746202, "grad_norm": 0.43821823596954346, "learning_rate": 4.7691906618429935e-05, "loss": 0.83, "num_input_tokens_seen": 8677896, "step": 15045 }, { "epoch": 2.241584748287161, "grad_norm": 0.39485034346580505, "learning_rate": 4.768917847545562e-05, "loss": 0.8478, "num_input_tokens_seen": 8680872, "step": 15050 }, { "epoch": 2.2423294608281203, "grad_norm": 0.466264009475708, "learning_rate": 4.768644879923887e-05, "loss": 0.8109, "num_input_tokens_seen": 8683560, "step": 15055 }, { "epoch": 2.2430741733690795, "grad_norm": 0.30319949984550476, "learning_rate": 4.768371758996415e-05, "loss": 0.803, "num_input_tokens_seen": 8686408, "step": 15060 }, { "epoch": 2.2438188859100388, "grad_norm": 0.26778391003608704, "learning_rate": 4.7680984847816015e-05, "loss": 0.8154, "num_input_tokens_seen": 8689320, "step": 15065 }, { "epoch": 2.244563598450998, "grad_norm": 0.34242409467697144, "learning_rate": 4.767825057297914e-05, "loss": 0.8101, "num_input_tokens_seen": 8692584, "step": 15070 }, { "epoch": 2.245308310991957, "grad_norm": 0.3099971413612366, "learning_rate": 4.767551476563829e-05, "loss": 0.8166, "num_input_tokens_seen": 8695432, "step": 15075 }, { "epoch": 2.2460530235329164, "grad_norm": 0.3584335148334503, "learning_rate": 4.767277742597835e-05, "loss": 0.7991, "num_input_tokens_seen": 8698152, "step": 15080 }, { "epoch": 2.2467977360738756, "grad_norm": 0.3285364508628845, "learning_rate": 4.7670038554184296e-05, "loss": 0.7881, "num_input_tokens_seen": 8700840, "step": 15085 }, { "epoch": 2.2475424486148348, "grad_norm": 0.38257795572280884, "learning_rate": 4.7667298150441194e-05, "loss": 0.8173, "num_input_tokens_seen": 8703912, "step": 15090 }, { "epoch": 2.248287161155794, "grad_norm": 0.3672109842300415, "learning_rate": 4.7664556214934255e-05, "loss": 0.8032, "num_input_tokens_seen": 8706888, "step": 15095 }, { "epoch": 2.249031873696753, "grad_norm": 0.3833599090576172, "learning_rate": 4.766181274784874e-05, "loss": 0.7767, "num_input_tokens_seen": 8709800, "step": 15100 }, { "epoch": 2.2497765862377124, "grad_norm": 0.375387579202652, "learning_rate": 4.765906774937007e-05, "loss": 0.8385, "num_input_tokens_seen": 8712616, "step": 15105 }, { "epoch": 2.2505212987786716, "grad_norm": 0.37498438358306885, "learning_rate": 4.765632121968371e-05, "loss": 0.8306, "num_input_tokens_seen": 8715464, "step": 15110 }, { "epoch": 2.2512660113196308, "grad_norm": 0.3769121766090393, "learning_rate": 4.765357315897529e-05, "loss": 0.7959, "num_input_tokens_seen": 8718888, "step": 15115 }, { "epoch": 2.25201072386059, "grad_norm": 0.29862481355667114, "learning_rate": 4.765082356743049e-05, "loss": 0.8201, "num_input_tokens_seen": 8721512, "step": 15120 }, { "epoch": 2.252755436401549, "grad_norm": 0.3784240186214447, "learning_rate": 4.7648072445235136e-05, "loss": 0.8002, "num_input_tokens_seen": 8724584, "step": 15125 }, { "epoch": 2.2535001489425084, "grad_norm": 0.4168369472026825, "learning_rate": 4.764531979257511e-05, "loss": 0.8399, "num_input_tokens_seen": 8727880, "step": 15130 }, { "epoch": 2.2542448614834676, "grad_norm": 0.36833760142326355, "learning_rate": 4.764256560963646e-05, "loss": 0.8138, "num_input_tokens_seen": 8731016, "step": 15135 }, { "epoch": 2.2549895740244263, "grad_norm": 0.36482667922973633, "learning_rate": 4.7639809896605275e-05, "loss": 0.8055, "num_input_tokens_seen": 8733896, "step": 15140 }, { "epoch": 2.255734286565386, "grad_norm": 0.3060387372970581, "learning_rate": 4.763705265366779e-05, "loss": 0.8345, "num_input_tokens_seen": 8736776, "step": 15145 }, { "epoch": 2.2564789991063448, "grad_norm": 0.4133160412311554, "learning_rate": 4.763429388101031e-05, "loss": 0.8141, "num_input_tokens_seen": 8739816, "step": 15150 }, { "epoch": 2.257223711647304, "grad_norm": 0.3375813961029053, "learning_rate": 4.763153357881928e-05, "loss": 0.8263, "num_input_tokens_seen": 8742984, "step": 15155 }, { "epoch": 2.257968424188263, "grad_norm": 0.3229716718196869, "learning_rate": 4.7628771747281226e-05, "loss": 0.7959, "num_input_tokens_seen": 8746088, "step": 15160 }, { "epoch": 2.2587131367292224, "grad_norm": 0.4324171841144562, "learning_rate": 4.762600838658278e-05, "loss": 0.7842, "num_input_tokens_seen": 8748968, "step": 15165 }, { "epoch": 2.2594578492701816, "grad_norm": 0.26970410346984863, "learning_rate": 4.762324349691067e-05, "loss": 0.7724, "num_input_tokens_seen": 8751848, "step": 15170 }, { "epoch": 2.2602025618111408, "grad_norm": 0.48207366466522217, "learning_rate": 4.762047707845175e-05, "loss": 0.8466, "num_input_tokens_seen": 8754536, "step": 15175 }, { "epoch": 2.2609472743521, "grad_norm": 0.31496790051460266, "learning_rate": 4.761770913139296e-05, "loss": 0.7898, "num_input_tokens_seen": 8757256, "step": 15180 }, { "epoch": 2.261691986893059, "grad_norm": 0.3789597451686859, "learning_rate": 4.761493965592134e-05, "loss": 0.7986, "num_input_tokens_seen": 8760104, "step": 15185 }, { "epoch": 2.2624366994340184, "grad_norm": 0.40664196014404297, "learning_rate": 4.761216865222404e-05, "loss": 0.8332, "num_input_tokens_seen": 8763112, "step": 15190 }, { "epoch": 2.2631814119749776, "grad_norm": 0.3225117325782776, "learning_rate": 4.7609396120488324e-05, "loss": 0.8282, "num_input_tokens_seen": 8766248, "step": 15195 }, { "epoch": 2.2639261245159368, "grad_norm": 0.49570369720458984, "learning_rate": 4.760662206090154e-05, "loss": 0.8014, "num_input_tokens_seen": 8768936, "step": 15200 }, { "epoch": 2.264670837056896, "grad_norm": 0.40382999181747437, "learning_rate": 4.7603846473651147e-05, "loss": 0.7979, "num_input_tokens_seen": 8771656, "step": 15205 }, { "epoch": 2.265415549597855, "grad_norm": 0.34094634652137756, "learning_rate": 4.760106935892471e-05, "loss": 0.7947, "num_input_tokens_seen": 8774472, "step": 15210 }, { "epoch": 2.2661602621388144, "grad_norm": 0.4436044692993164, "learning_rate": 4.7598290716909897e-05, "loss": 0.8123, "num_input_tokens_seen": 8777224, "step": 15215 }, { "epoch": 2.2669049746797736, "grad_norm": 0.4093318581581116, "learning_rate": 4.7595510547794465e-05, "loss": 0.8265, "num_input_tokens_seen": 8780296, "step": 15220 }, { "epoch": 2.267649687220733, "grad_norm": 0.3104349672794342, "learning_rate": 4.759272885176631e-05, "loss": 0.7988, "num_input_tokens_seen": 8783240, "step": 15225 }, { "epoch": 2.268394399761692, "grad_norm": 0.4298892617225647, "learning_rate": 4.758994562901339e-05, "loss": 0.8295, "num_input_tokens_seen": 8786216, "step": 15230 }, { "epoch": 2.269139112302651, "grad_norm": 0.34308815002441406, "learning_rate": 4.7587160879723785e-05, "loss": 0.8123, "num_input_tokens_seen": 8789192, "step": 15235 }, { "epoch": 2.2698838248436104, "grad_norm": 0.35103514790534973, "learning_rate": 4.7584374604085684e-05, "loss": 0.7984, "num_input_tokens_seen": 8792040, "step": 15240 }, { "epoch": 2.2706285373845696, "grad_norm": 0.30861222743988037, "learning_rate": 4.758158680228737e-05, "loss": 0.7982, "num_input_tokens_seen": 8794664, "step": 15245 }, { "epoch": 2.271373249925529, "grad_norm": 0.3119606375694275, "learning_rate": 4.757879747451722e-05, "loss": 0.7926, "num_input_tokens_seen": 8797704, "step": 15250 }, { "epoch": 2.272117962466488, "grad_norm": 0.24958181381225586, "learning_rate": 4.757600662096375e-05, "loss": 0.814, "num_input_tokens_seen": 8800872, "step": 15255 }, { "epoch": 2.272862675007447, "grad_norm": 0.27303868532180786, "learning_rate": 4.757321424181553e-05, "loss": 0.7845, "num_input_tokens_seen": 8803688, "step": 15260 }, { "epoch": 2.2736073875484064, "grad_norm": 0.416533499956131, "learning_rate": 4.7570420337261275e-05, "loss": 0.8004, "num_input_tokens_seen": 8806504, "step": 15265 }, { "epoch": 2.2743521000893656, "grad_norm": 0.45423370599746704, "learning_rate": 4.756762490748977e-05, "loss": 0.8787, "num_input_tokens_seen": 8809544, "step": 15270 }, { "epoch": 2.275096812630325, "grad_norm": 0.2931244373321533, "learning_rate": 4.7564827952689936e-05, "loss": 0.8114, "num_input_tokens_seen": 8812520, "step": 15275 }, { "epoch": 2.275841525171284, "grad_norm": 0.35823991894721985, "learning_rate": 4.756202947305076e-05, "loss": 0.8077, "num_input_tokens_seen": 8815304, "step": 15280 }, { "epoch": 2.276586237712243, "grad_norm": 0.3669818639755249, "learning_rate": 4.755922946876137e-05, "loss": 0.8033, "num_input_tokens_seen": 8818280, "step": 15285 }, { "epoch": 2.2773309502532024, "grad_norm": 0.41392937302589417, "learning_rate": 4.755642794001097e-05, "loss": 0.8291, "num_input_tokens_seen": 8820968, "step": 15290 }, { "epoch": 2.2780756627941616, "grad_norm": 0.4548279047012329, "learning_rate": 4.755362488698888e-05, "loss": 0.787, "num_input_tokens_seen": 8823976, "step": 15295 }, { "epoch": 2.278820375335121, "grad_norm": 0.6091756224632263, "learning_rate": 4.755082030988452e-05, "loss": 0.9047, "num_input_tokens_seen": 8826664, "step": 15300 }, { "epoch": 2.2795650878760796, "grad_norm": 0.3575078248977661, "learning_rate": 4.7548014208887396e-05, "loss": 0.7756, "num_input_tokens_seen": 8829928, "step": 15305 }, { "epoch": 2.2803098004170392, "grad_norm": 0.374822735786438, "learning_rate": 4.7545206584187155e-05, "loss": 0.8204, "num_input_tokens_seen": 8832712, "step": 15310 }, { "epoch": 2.281054512957998, "grad_norm": 0.35321125388145447, "learning_rate": 4.7542397435973515e-05, "loss": 0.8055, "num_input_tokens_seen": 8836200, "step": 15315 }, { "epoch": 2.2817992254989576, "grad_norm": 0.28649401664733887, "learning_rate": 4.75395867644363e-05, "loss": 0.8113, "num_input_tokens_seen": 8839080, "step": 15320 }, { "epoch": 2.2825439380399164, "grad_norm": 0.41124093532562256, "learning_rate": 4.753677456976546e-05, "loss": 0.8283, "num_input_tokens_seen": 8841960, "step": 15325 }, { "epoch": 2.2832886505808756, "grad_norm": 0.3553503155708313, "learning_rate": 4.753396085215102e-05, "loss": 0.8055, "num_input_tokens_seen": 8844584, "step": 15330 }, { "epoch": 2.284033363121835, "grad_norm": 0.31394606828689575, "learning_rate": 4.753114561178311e-05, "loss": 0.7851, "num_input_tokens_seen": 8847400, "step": 15335 }, { "epoch": 2.284778075662794, "grad_norm": 0.24963605403900146, "learning_rate": 4.7528328848852e-05, "loss": 0.8116, "num_input_tokens_seen": 8850120, "step": 15340 }, { "epoch": 2.285522788203753, "grad_norm": 0.3121509552001953, "learning_rate": 4.752551056354801e-05, "loss": 0.7942, "num_input_tokens_seen": 8852936, "step": 15345 }, { "epoch": 2.2862675007447124, "grad_norm": 0.32017216086387634, "learning_rate": 4.7522690756061606e-05, "loss": 0.8093, "num_input_tokens_seen": 8856104, "step": 15350 }, { "epoch": 2.2870122132856716, "grad_norm": 0.4422784745693207, "learning_rate": 4.751986942658332e-05, "loss": 0.8001, "num_input_tokens_seen": 8859368, "step": 15355 }, { "epoch": 2.287756925826631, "grad_norm": 0.30778369307518005, "learning_rate": 4.751704657530383e-05, "loss": 0.8069, "num_input_tokens_seen": 8862024, "step": 15360 }, { "epoch": 2.28850163836759, "grad_norm": 0.32762494683265686, "learning_rate": 4.751422220241387e-05, "loss": 0.8029, "num_input_tokens_seen": 8864872, "step": 15365 }, { "epoch": 2.289246350908549, "grad_norm": 0.3535151779651642, "learning_rate": 4.7511396308104314e-05, "loss": 0.7945, "num_input_tokens_seen": 8867752, "step": 15370 }, { "epoch": 2.2899910634495084, "grad_norm": 0.4044916033744812, "learning_rate": 4.750856889256613e-05, "loss": 0.7771, "num_input_tokens_seen": 8870440, "step": 15375 }, { "epoch": 2.2907357759904676, "grad_norm": 0.28357622027397156, "learning_rate": 4.750573995599036e-05, "loss": 0.8124, "num_input_tokens_seen": 8873160, "step": 15380 }, { "epoch": 2.291480488531427, "grad_norm": 0.2562362253665924, "learning_rate": 4.7502909498568194e-05, "loss": 0.8384, "num_input_tokens_seen": 8876040, "step": 15385 }, { "epoch": 2.292225201072386, "grad_norm": 0.3847697377204895, "learning_rate": 4.7500077520490884e-05, "loss": 0.7862, "num_input_tokens_seen": 8878728, "step": 15390 }, { "epoch": 2.2929699136133452, "grad_norm": 0.3188895881175995, "learning_rate": 4.749724402194982e-05, "loss": 0.8231, "num_input_tokens_seen": 8881544, "step": 15395 }, { "epoch": 2.2937146261543044, "grad_norm": 0.33503374457359314, "learning_rate": 4.749440900313648e-05, "loss": 0.7938, "num_input_tokens_seen": 8884104, "step": 15400 }, { "epoch": 2.2944593386952636, "grad_norm": 0.3238895833492279, "learning_rate": 4.7491572464242415e-05, "loss": 0.8008, "num_input_tokens_seen": 8886888, "step": 15405 }, { "epoch": 2.295204051236223, "grad_norm": 0.6821622252464294, "learning_rate": 4.748873440545935e-05, "loss": 0.815, "num_input_tokens_seen": 8889928, "step": 15410 }, { "epoch": 2.295948763777182, "grad_norm": 0.26427069306373596, "learning_rate": 4.7485894826979025e-05, "loss": 0.7788, "num_input_tokens_seen": 8893096, "step": 15415 }, { "epoch": 2.2966934763181412, "grad_norm": 0.26384973526000977, "learning_rate": 4.748305372899337e-05, "loss": 0.8046, "num_input_tokens_seen": 8895624, "step": 15420 }, { "epoch": 2.2974381888591004, "grad_norm": 0.37960851192474365, "learning_rate": 4.7480211111694335e-05, "loss": 0.786, "num_input_tokens_seen": 8898408, "step": 15425 }, { "epoch": 2.2981829014000597, "grad_norm": 0.4241921305656433, "learning_rate": 4.747736697527404e-05, "loss": 0.8414, "num_input_tokens_seen": 8901352, "step": 15430 }, { "epoch": 2.298927613941019, "grad_norm": 0.2838003933429718, "learning_rate": 4.747452131992467e-05, "loss": 0.8172, "num_input_tokens_seen": 8904008, "step": 15435 }, { "epoch": 2.299672326481978, "grad_norm": 0.3577616512775421, "learning_rate": 4.747167414583852e-05, "loss": 0.774, "num_input_tokens_seen": 8906920, "step": 15440 }, { "epoch": 2.3004170390229373, "grad_norm": 0.334334135055542, "learning_rate": 4.7468825453208e-05, "loss": 0.7886, "num_input_tokens_seen": 8909832, "step": 15445 }, { "epoch": 2.3011617515638965, "grad_norm": 0.34482666850090027, "learning_rate": 4.74659752422256e-05, "loss": 0.8215, "num_input_tokens_seen": 8912680, "step": 15450 }, { "epoch": 2.3019064641048557, "grad_norm": 0.3224308490753174, "learning_rate": 4.746312351308394e-05, "loss": 0.8236, "num_input_tokens_seen": 8915272, "step": 15455 }, { "epoch": 2.302651176645815, "grad_norm": 0.22628746926784515, "learning_rate": 4.746027026597572e-05, "loss": 0.8236, "num_input_tokens_seen": 8918248, "step": 15460 }, { "epoch": 2.303395889186774, "grad_norm": 0.3461085259914398, "learning_rate": 4.7457415501093746e-05, "loss": 0.811, "num_input_tokens_seen": 8921064, "step": 15465 }, { "epoch": 2.3041406017277333, "grad_norm": 0.4001925587654114, "learning_rate": 4.745455921863095e-05, "loss": 0.8029, "num_input_tokens_seen": 8924136, "step": 15470 }, { "epoch": 2.3048853142686925, "grad_norm": 0.33790040016174316, "learning_rate": 4.7451701418780334e-05, "loss": 0.8242, "num_input_tokens_seen": 8927080, "step": 15475 }, { "epoch": 2.3056300268096512, "grad_norm": 0.45091429352760315, "learning_rate": 4.744884210173501e-05, "loss": 0.8258, "num_input_tokens_seen": 8929768, "step": 15480 }, { "epoch": 2.306374739350611, "grad_norm": 0.4091050922870636, "learning_rate": 4.744598126768821e-05, "loss": 0.8237, "num_input_tokens_seen": 8933224, "step": 15485 }, { "epoch": 2.3071194518915696, "grad_norm": 0.24660621583461761, "learning_rate": 4.744311891683325e-05, "loss": 0.8317, "num_input_tokens_seen": 8935656, "step": 15490 }, { "epoch": 2.3078641644325293, "grad_norm": 0.3010927438735962, "learning_rate": 4.7440255049363566e-05, "loss": 0.7965, "num_input_tokens_seen": 8938440, "step": 15495 }, { "epoch": 2.308608876973488, "grad_norm": 0.35200393199920654, "learning_rate": 4.7437389665472686e-05, "loss": 0.803, "num_input_tokens_seen": 8941288, "step": 15500 }, { "epoch": 2.3093535895144472, "grad_norm": 0.3963358998298645, "learning_rate": 4.7434522765354226e-05, "loss": 0.788, "num_input_tokens_seen": 8944200, "step": 15505 }, { "epoch": 2.3100983020554064, "grad_norm": 0.27950477600097656, "learning_rate": 4.743165434920194e-05, "loss": 0.8143, "num_input_tokens_seen": 8947016, "step": 15510 }, { "epoch": 2.3108430145963657, "grad_norm": 0.32782891392707825, "learning_rate": 4.742878441720965e-05, "loss": 0.8109, "num_input_tokens_seen": 8949896, "step": 15515 }, { "epoch": 2.311587727137325, "grad_norm": 0.3153018653392792, "learning_rate": 4.7425912969571295e-05, "loss": 0.7964, "num_input_tokens_seen": 8952872, "step": 15520 }, { "epoch": 2.312332439678284, "grad_norm": 0.2996128499507904, "learning_rate": 4.742304000648092e-05, "loss": 0.7949, "num_input_tokens_seen": 8955656, "step": 15525 }, { "epoch": 2.3130771522192433, "grad_norm": 0.291543573141098, "learning_rate": 4.742016552813267e-05, "loss": 0.8254, "num_input_tokens_seen": 8958440, "step": 15530 }, { "epoch": 2.3138218647602025, "grad_norm": 0.3405112028121948, "learning_rate": 4.7417289534720774e-05, "loss": 0.8004, "num_input_tokens_seen": 8961192, "step": 15535 }, { "epoch": 2.3145665773011617, "grad_norm": 0.31924813985824585, "learning_rate": 4.74144120264396e-05, "loss": 0.7549, "num_input_tokens_seen": 8964232, "step": 15540 }, { "epoch": 2.315311289842121, "grad_norm": 0.3522207736968994, "learning_rate": 4.74115330034836e-05, "loss": 0.8154, "num_input_tokens_seen": 8967016, "step": 15545 }, { "epoch": 2.31605600238308, "grad_norm": 0.4272305369377136, "learning_rate": 4.7408652466047313e-05, "loss": 0.8241, "num_input_tokens_seen": 8969896, "step": 15550 }, { "epoch": 2.3168007149240393, "grad_norm": 0.3419126272201538, "learning_rate": 4.7405770414325404e-05, "loss": 0.7804, "num_input_tokens_seen": 8973160, "step": 15555 }, { "epoch": 2.3175454274649985, "grad_norm": 0.3454972505569458, "learning_rate": 4.740288684851262e-05, "loss": 0.7854, "num_input_tokens_seen": 8975976, "step": 15560 }, { "epoch": 2.3182901400059577, "grad_norm": 0.430488258600235, "learning_rate": 4.7400001768803826e-05, "loss": 0.8371, "num_input_tokens_seen": 8978920, "step": 15565 }, { "epoch": 2.319034852546917, "grad_norm": 0.28450140357017517, "learning_rate": 4.739711517539398e-05, "loss": 0.7898, "num_input_tokens_seen": 8981736, "step": 15570 }, { "epoch": 2.319779565087876, "grad_norm": 0.30176109075546265, "learning_rate": 4.7394227068478157e-05, "loss": 0.7853, "num_input_tokens_seen": 8984488, "step": 15575 }, { "epoch": 2.3205242776288353, "grad_norm": 0.3263566792011261, "learning_rate": 4.739133744825152e-05, "loss": 0.8204, "num_input_tokens_seen": 8987336, "step": 15580 }, { "epoch": 2.3212689901697945, "grad_norm": 0.27551761269569397, "learning_rate": 4.738844631490933e-05, "loss": 0.8214, "num_input_tokens_seen": 8990440, "step": 15585 }, { "epoch": 2.3220137027107537, "grad_norm": 0.5360879302024841, "learning_rate": 4.738555366864696e-05, "loss": 0.8135, "num_input_tokens_seen": 8993160, "step": 15590 }, { "epoch": 2.322758415251713, "grad_norm": 0.38924074172973633, "learning_rate": 4.738265950965989e-05, "loss": 0.7892, "num_input_tokens_seen": 8995976, "step": 15595 }, { "epoch": 2.323503127792672, "grad_norm": 0.302392840385437, "learning_rate": 4.73797638381437e-05, "loss": 0.8623, "num_input_tokens_seen": 8998632, "step": 15600 }, { "epoch": 2.3242478403336313, "grad_norm": 0.30124789476394653, "learning_rate": 4.7376866654294047e-05, "loss": 0.7813, "num_input_tokens_seen": 9001384, "step": 15605 }, { "epoch": 2.3249925528745905, "grad_norm": 0.3191213011741638, "learning_rate": 4.7373967958306724e-05, "loss": 0.7852, "num_input_tokens_seen": 9004040, "step": 15610 }, { "epoch": 2.3257372654155497, "grad_norm": 0.3501865267753601, "learning_rate": 4.737106775037762e-05, "loss": 0.7893, "num_input_tokens_seen": 9006984, "step": 15615 }, { "epoch": 2.326481977956509, "grad_norm": 0.43287011981010437, "learning_rate": 4.736816603070271e-05, "loss": 0.8146, "num_input_tokens_seen": 9009928, "step": 15620 }, { "epoch": 2.327226690497468, "grad_norm": 0.3088306188583374, "learning_rate": 4.736526279947807e-05, "loss": 0.8063, "num_input_tokens_seen": 9012840, "step": 15625 }, { "epoch": 2.3279714030384273, "grad_norm": 0.4720247685909271, "learning_rate": 4.736235805689992e-05, "loss": 0.8414, "num_input_tokens_seen": 9015400, "step": 15630 }, { "epoch": 2.3287161155793865, "grad_norm": 0.2990228533744812, "learning_rate": 4.735945180316451e-05, "loss": 0.7769, "num_input_tokens_seen": 9018280, "step": 15635 }, { "epoch": 2.3294608281203457, "grad_norm": 0.27140024304389954, "learning_rate": 4.7356544038468266e-05, "loss": 0.8062, "num_input_tokens_seen": 9021096, "step": 15640 }, { "epoch": 2.330205540661305, "grad_norm": 0.34597909450531006, "learning_rate": 4.735363476300767e-05, "loss": 0.782, "num_input_tokens_seen": 9023976, "step": 15645 }, { "epoch": 2.330950253202264, "grad_norm": 0.39913448691368103, "learning_rate": 4.735072397697932e-05, "loss": 0.85, "num_input_tokens_seen": 9026920, "step": 15650 }, { "epoch": 2.331694965743223, "grad_norm": 0.37896043062210083, "learning_rate": 4.734781168057991e-05, "loss": 0.8123, "num_input_tokens_seen": 9029576, "step": 15655 }, { "epoch": 2.3324396782841825, "grad_norm": 0.3143155872821808, "learning_rate": 4.734489787400626e-05, "loss": 0.8298, "num_input_tokens_seen": 9032296, "step": 15660 }, { "epoch": 2.3331843908251413, "grad_norm": 0.40202391147613525, "learning_rate": 4.7341982557455245e-05, "loss": 0.7936, "num_input_tokens_seen": 9035272, "step": 15665 }, { "epoch": 2.333929103366101, "grad_norm": 0.36903342604637146, "learning_rate": 4.733906573112389e-05, "loss": 0.8312, "num_input_tokens_seen": 9038120, "step": 15670 }, { "epoch": 2.3346738159070597, "grad_norm": 0.3529183268547058, "learning_rate": 4.7336147395209294e-05, "loss": 0.8227, "num_input_tokens_seen": 9040840, "step": 15675 }, { "epoch": 2.335418528448019, "grad_norm": 0.41561436653137207, "learning_rate": 4.733322754990867e-05, "loss": 0.8013, "num_input_tokens_seen": 9043848, "step": 15680 }, { "epoch": 2.336163240988978, "grad_norm": 0.2534521520137787, "learning_rate": 4.733030619541934e-05, "loss": 0.7747, "num_input_tokens_seen": 9046696, "step": 15685 }, { "epoch": 2.3369079535299373, "grad_norm": 0.38724642992019653, "learning_rate": 4.732738333193869e-05, "loss": 0.805, "num_input_tokens_seen": 9049928, "step": 15690 }, { "epoch": 2.3376526660708965, "grad_norm": 0.3711051046848297, "learning_rate": 4.7324458959664256e-05, "loss": 0.8362, "num_input_tokens_seen": 9052872, "step": 15695 }, { "epoch": 2.3383973786118557, "grad_norm": 0.32247382402420044, "learning_rate": 4.7321533078793655e-05, "loss": 0.793, "num_input_tokens_seen": 9055624, "step": 15700 }, { "epoch": 2.339142091152815, "grad_norm": 0.43033191561698914, "learning_rate": 4.73186056895246e-05, "loss": 0.8004, "num_input_tokens_seen": 9058280, "step": 15705 }, { "epoch": 2.339886803693774, "grad_norm": 0.2375713437795639, "learning_rate": 4.731567679205491e-05, "loss": 0.815, "num_input_tokens_seen": 9061000, "step": 15710 }, { "epoch": 2.3406315162347333, "grad_norm": 0.3111611604690552, "learning_rate": 4.731274638658251e-05, "loss": 0.7973, "num_input_tokens_seen": 9064296, "step": 15715 }, { "epoch": 2.3413762287756925, "grad_norm": 0.3356780409812927, "learning_rate": 4.7309814473305427e-05, "loss": 0.8197, "num_input_tokens_seen": 9066856, "step": 15720 }, { "epoch": 2.3421209413166517, "grad_norm": 0.3215465545654297, "learning_rate": 4.730688105242179e-05, "loss": 0.8081, "num_input_tokens_seen": 9069800, "step": 15725 }, { "epoch": 2.342865653857611, "grad_norm": 0.30584418773651123, "learning_rate": 4.7303946124129824e-05, "loss": 0.8066, "num_input_tokens_seen": 9072744, "step": 15730 }, { "epoch": 2.34361036639857, "grad_norm": 0.2940835952758789, "learning_rate": 4.730100968862786e-05, "loss": 0.7907, "num_input_tokens_seen": 9075528, "step": 15735 }, { "epoch": 2.3443550789395293, "grad_norm": 0.3411778509616852, "learning_rate": 4.7298071746114326e-05, "loss": 0.7722, "num_input_tokens_seen": 9078248, "step": 15740 }, { "epoch": 2.3450997914804885, "grad_norm": 0.3472808301448822, "learning_rate": 4.729513229678776e-05, "loss": 0.7987, "num_input_tokens_seen": 9081096, "step": 15745 }, { "epoch": 2.3458445040214477, "grad_norm": 0.3203125, "learning_rate": 4.7292191340846806e-05, "loss": 0.8199, "num_input_tokens_seen": 9083688, "step": 15750 }, { "epoch": 2.346589216562407, "grad_norm": 0.3111542761325836, "learning_rate": 4.728924887849019e-05, "loss": 0.8126, "num_input_tokens_seen": 9086312, "step": 15755 }, { "epoch": 2.347333929103366, "grad_norm": 0.3254181444644928, "learning_rate": 4.728630490991676e-05, "loss": 0.8044, "num_input_tokens_seen": 9089032, "step": 15760 }, { "epoch": 2.3480786416443253, "grad_norm": 0.3094411790370941, "learning_rate": 4.728335943532545e-05, "loss": 0.7993, "num_input_tokens_seen": 9091816, "step": 15765 }, { "epoch": 2.3488233541852845, "grad_norm": 0.47549980878829956, "learning_rate": 4.7280412454915316e-05, "loss": 0.7885, "num_input_tokens_seen": 9094920, "step": 15770 }, { "epoch": 2.3495680667262437, "grad_norm": 0.4075010418891907, "learning_rate": 4.727746396888548e-05, "loss": 0.8268, "num_input_tokens_seen": 9097960, "step": 15775 }, { "epoch": 2.350312779267203, "grad_norm": 0.3437873125076294, "learning_rate": 4.7274513977435206e-05, "loss": 0.8245, "num_input_tokens_seen": 9100872, "step": 15780 }, { "epoch": 2.351057491808162, "grad_norm": 0.5438387989997864, "learning_rate": 4.7271562480763845e-05, "loss": 0.8285, "num_input_tokens_seen": 9103720, "step": 15785 }, { "epoch": 2.3518022043491214, "grad_norm": 0.29106754064559937, "learning_rate": 4.726860947907084e-05, "loss": 0.8203, "num_input_tokens_seen": 9106632, "step": 15790 }, { "epoch": 2.3525469168900806, "grad_norm": 0.338867723941803, "learning_rate": 4.726565497255575e-05, "loss": 0.8141, "num_input_tokens_seen": 9109800, "step": 15795 }, { "epoch": 2.3532916294310398, "grad_norm": 0.4336128830909729, "learning_rate": 4.7262698961418206e-05, "loss": 0.8151, "num_input_tokens_seen": 9112456, "step": 15800 }, { "epoch": 2.354036341971999, "grad_norm": 0.40418943762779236, "learning_rate": 4.7259741445857994e-05, "loss": 0.7913, "num_input_tokens_seen": 9115528, "step": 15805 }, { "epoch": 2.354781054512958, "grad_norm": 0.32585370540618896, "learning_rate": 4.7256782426074956e-05, "loss": 0.7747, "num_input_tokens_seen": 9118344, "step": 15810 }, { "epoch": 2.3555257670539174, "grad_norm": 0.4551641345024109, "learning_rate": 4.725382190226904e-05, "loss": 0.8559, "num_input_tokens_seen": 9121224, "step": 15815 }, { "epoch": 2.3562704795948766, "grad_norm": 0.34871706366539, "learning_rate": 4.725085987464032e-05, "loss": 0.8319, "num_input_tokens_seen": 9123912, "step": 15820 }, { "epoch": 2.3570151921358358, "grad_norm": 0.2598797082901001, "learning_rate": 4.724789634338897e-05, "loss": 0.8039, "num_input_tokens_seen": 9126600, "step": 15825 }, { "epoch": 2.3577599046767945, "grad_norm": 0.3907880187034607, "learning_rate": 4.7244931308715215e-05, "loss": 0.7975, "num_input_tokens_seen": 9129384, "step": 15830 }, { "epoch": 2.358504617217754, "grad_norm": 0.40262937545776367, "learning_rate": 4.724196477081946e-05, "loss": 0.8066, "num_input_tokens_seen": 9132264, "step": 15835 }, { "epoch": 2.359249329758713, "grad_norm": 0.28560125827789307, "learning_rate": 4.723899672990215e-05, "loss": 0.8188, "num_input_tokens_seen": 9134856, "step": 15840 }, { "epoch": 2.359994042299672, "grad_norm": 0.38004568219184875, "learning_rate": 4.7236027186163856e-05, "loss": 0.7993, "num_input_tokens_seen": 9137768, "step": 15845 }, { "epoch": 2.3607387548406313, "grad_norm": 0.39408910274505615, "learning_rate": 4.7233056139805254e-05, "loss": 0.8408, "num_input_tokens_seen": 9140456, "step": 15850 }, { "epoch": 2.3614834673815905, "grad_norm": 0.3057847321033478, "learning_rate": 4.7230083591027106e-05, "loss": 0.7883, "num_input_tokens_seen": 9143496, "step": 15855 }, { "epoch": 2.3622281799225497, "grad_norm": 0.3968273103237152, "learning_rate": 4.72271095400303e-05, "loss": 0.8228, "num_input_tokens_seen": 9146312, "step": 15860 }, { "epoch": 2.362972892463509, "grad_norm": 0.48675107955932617, "learning_rate": 4.722413398701579e-05, "loss": 0.8156, "num_input_tokens_seen": 9149128, "step": 15865 }, { "epoch": 2.363717605004468, "grad_norm": 0.35529419779777527, "learning_rate": 4.722115693218467e-05, "loss": 0.8048, "num_input_tokens_seen": 9151752, "step": 15870 }, { "epoch": 2.3644623175454274, "grad_norm": 0.3980487585067749, "learning_rate": 4.72181783757381e-05, "loss": 0.792, "num_input_tokens_seen": 9154408, "step": 15875 }, { "epoch": 2.3652070300863866, "grad_norm": 0.4095955789089203, "learning_rate": 4.721519831787737e-05, "loss": 0.7959, "num_input_tokens_seen": 9157736, "step": 15880 }, { "epoch": 2.3659517426273458, "grad_norm": 0.2976531684398651, "learning_rate": 4.721221675880386e-05, "loss": 0.794, "num_input_tokens_seen": 9160552, "step": 15885 }, { "epoch": 2.366696455168305, "grad_norm": 0.38083457946777344, "learning_rate": 4.7209233698719056e-05, "loss": 0.8082, "num_input_tokens_seen": 9163272, "step": 15890 }, { "epoch": 2.367441167709264, "grad_norm": 0.3247082233428955, "learning_rate": 4.7206249137824535e-05, "loss": 0.794, "num_input_tokens_seen": 9166120, "step": 15895 }, { "epoch": 2.3681858802502234, "grad_norm": 0.2983931601047516, "learning_rate": 4.7203263076321966e-05, "loss": 0.7781, "num_input_tokens_seen": 9168968, "step": 15900 }, { "epoch": 2.3689305927911826, "grad_norm": 0.28655678033828735, "learning_rate": 4.720027551441316e-05, "loss": 0.8107, "num_input_tokens_seen": 9171816, "step": 15905 }, { "epoch": 2.3696753053321418, "grad_norm": 0.39007431268692017, "learning_rate": 4.719728645229999e-05, "loss": 0.734, "num_input_tokens_seen": 9174728, "step": 15910 }, { "epoch": 2.370420017873101, "grad_norm": 0.35638415813446045, "learning_rate": 4.719429589018446e-05, "loss": 0.823, "num_input_tokens_seen": 9177544, "step": 15915 }, { "epoch": 2.37116473041406, "grad_norm": 0.4264352321624756, "learning_rate": 4.719130382826864e-05, "loss": 0.827, "num_input_tokens_seen": 9180232, "step": 15920 }, { "epoch": 2.3719094429550194, "grad_norm": 0.37712809443473816, "learning_rate": 4.718831026675473e-05, "loss": 0.7558, "num_input_tokens_seen": 9183048, "step": 15925 }, { "epoch": 2.3726541554959786, "grad_norm": 0.6623470187187195, "learning_rate": 4.718531520584503e-05, "loss": 0.8763, "num_input_tokens_seen": 9185800, "step": 15930 }, { "epoch": 2.373398868036938, "grad_norm": 0.3246614933013916, "learning_rate": 4.718231864574193e-05, "loss": 0.81, "num_input_tokens_seen": 9188904, "step": 15935 }, { "epoch": 2.374143580577897, "grad_norm": 0.39273950457572937, "learning_rate": 4.717932058664791e-05, "loss": 0.81, "num_input_tokens_seen": 9191784, "step": 15940 }, { "epoch": 2.374888293118856, "grad_norm": 0.37011778354644775, "learning_rate": 4.717632102876559e-05, "loss": 0.7799, "num_input_tokens_seen": 9194728, "step": 15945 }, { "epoch": 2.3756330056598154, "grad_norm": 0.41955462098121643, "learning_rate": 4.717331997229765e-05, "loss": 0.829, "num_input_tokens_seen": 9197416, "step": 15950 }, { "epoch": 2.3763777182007746, "grad_norm": 0.3168965280056, "learning_rate": 4.71703174174469e-05, "loss": 0.7904, "num_input_tokens_seen": 9200648, "step": 15955 }, { "epoch": 2.377122430741734, "grad_norm": 0.33625704050064087, "learning_rate": 4.7167313364416234e-05, "loss": 0.7906, "num_input_tokens_seen": 9203464, "step": 15960 }, { "epoch": 2.377867143282693, "grad_norm": 0.346825510263443, "learning_rate": 4.716430781340866e-05, "loss": 0.8163, "num_input_tokens_seen": 9206216, "step": 15965 }, { "epoch": 2.378611855823652, "grad_norm": 0.3609030544757843, "learning_rate": 4.716130076462728e-05, "loss": 0.8197, "num_input_tokens_seen": 9208840, "step": 15970 }, { "epoch": 2.3793565683646114, "grad_norm": 0.3647324740886688, "learning_rate": 4.715829221827529e-05, "loss": 0.844, "num_input_tokens_seen": 9212008, "step": 15975 }, { "epoch": 2.3801012809055706, "grad_norm": 0.25844308733940125, "learning_rate": 4.7155282174556004e-05, "loss": 0.8258, "num_input_tokens_seen": 9214920, "step": 15980 }, { "epoch": 2.38084599344653, "grad_norm": 0.36607611179351807, "learning_rate": 4.7152270633672826e-05, "loss": 0.778, "num_input_tokens_seen": 9217768, "step": 15985 }, { "epoch": 2.381590705987489, "grad_norm": 0.45927271246910095, "learning_rate": 4.7149257595829255e-05, "loss": 0.8523, "num_input_tokens_seen": 9220488, "step": 15990 }, { "epoch": 2.382335418528448, "grad_norm": 0.4602968990802765, "learning_rate": 4.714624306122892e-05, "loss": 0.7973, "num_input_tokens_seen": 9223304, "step": 15995 }, { "epoch": 2.3830801310694074, "grad_norm": 0.3862244486808777, "learning_rate": 4.714322703007551e-05, "loss": 0.8263, "num_input_tokens_seen": 9226280, "step": 16000 }, { "epoch": 2.383824843610366, "grad_norm": 0.3183246850967407, "learning_rate": 4.7140209502572855e-05, "loss": 0.8072, "num_input_tokens_seen": 9229096, "step": 16005 }, { "epoch": 2.384569556151326, "grad_norm": 0.2828125059604645, "learning_rate": 4.713719047892484e-05, "loss": 0.8225, "num_input_tokens_seen": 9232008, "step": 16010 }, { "epoch": 2.3853142686922846, "grad_norm": 0.4989328384399414, "learning_rate": 4.713416995933551e-05, "loss": 0.8117, "num_input_tokens_seen": 9235080, "step": 16015 }, { "epoch": 2.386058981233244, "grad_norm": 0.3143521845340729, "learning_rate": 4.7131147944008965e-05, "loss": 0.8123, "num_input_tokens_seen": 9238056, "step": 16020 }, { "epoch": 2.386803693774203, "grad_norm": 0.3047296106815338, "learning_rate": 4.712812443314941e-05, "loss": 0.8148, "num_input_tokens_seen": 9241192, "step": 16025 }, { "epoch": 2.387548406315162, "grad_norm": 0.30453604459762573, "learning_rate": 4.7125099426961185e-05, "loss": 0.7726, "num_input_tokens_seen": 9243848, "step": 16030 }, { "epoch": 2.3882931188561214, "grad_norm": 0.3993268609046936, "learning_rate": 4.712207292564869e-05, "loss": 0.7803, "num_input_tokens_seen": 9246728, "step": 16035 }, { "epoch": 2.3890378313970806, "grad_norm": 0.26895689964294434, "learning_rate": 4.7119044929416443e-05, "loss": 0.8037, "num_input_tokens_seen": 9249672, "step": 16040 }, { "epoch": 2.38978254393804, "grad_norm": 0.4736969769001007, "learning_rate": 4.7116015438469074e-05, "loss": 0.798, "num_input_tokens_seen": 9252456, "step": 16045 }, { "epoch": 2.390527256478999, "grad_norm": 0.347234845161438, "learning_rate": 4.7112984453011297e-05, "loss": 0.8001, "num_input_tokens_seen": 9255432, "step": 16050 }, { "epoch": 2.391271969019958, "grad_norm": 0.31001055240631104, "learning_rate": 4.7109951973247945e-05, "loss": 0.8044, "num_input_tokens_seen": 9258344, "step": 16055 }, { "epoch": 2.3920166815609174, "grad_norm": 0.2950536608695984, "learning_rate": 4.7106917999383926e-05, "loss": 0.8033, "num_input_tokens_seen": 9261416, "step": 16060 }, { "epoch": 2.3927613941018766, "grad_norm": 0.3829154074192047, "learning_rate": 4.710388253162426e-05, "loss": 0.8068, "num_input_tokens_seen": 9264264, "step": 16065 }, { "epoch": 2.393506106642836, "grad_norm": 0.3642735183238983, "learning_rate": 4.710084557017409e-05, "loss": 0.8159, "num_input_tokens_seen": 9267400, "step": 16070 }, { "epoch": 2.394250819183795, "grad_norm": 0.3729928731918335, "learning_rate": 4.709780711523862e-05, "loss": 0.8423, "num_input_tokens_seen": 9270184, "step": 16075 }, { "epoch": 2.394995531724754, "grad_norm": 0.44748055934906006, "learning_rate": 4.70947671670232e-05, "loss": 0.8257, "num_input_tokens_seen": 9272776, "step": 16080 }, { "epoch": 2.3957402442657134, "grad_norm": 0.32009202241897583, "learning_rate": 4.709172572573325e-05, "loss": 0.8039, "num_input_tokens_seen": 9275816, "step": 16085 }, { "epoch": 2.3964849568066726, "grad_norm": 0.33991122245788574, "learning_rate": 4.708868279157428e-05, "loss": 0.7875, "num_input_tokens_seen": 9278696, "step": 16090 }, { "epoch": 2.397229669347632, "grad_norm": 0.3088815212249756, "learning_rate": 4.7085638364751936e-05, "loss": 0.8042, "num_input_tokens_seen": 9281672, "step": 16095 }, { "epoch": 2.397974381888591, "grad_norm": 0.4519553482532501, "learning_rate": 4.7082592445471954e-05, "loss": 0.8221, "num_input_tokens_seen": 9284552, "step": 16100 }, { "epoch": 2.3987190944295502, "grad_norm": 0.2576882541179657, "learning_rate": 4.7079545033940155e-05, "loss": 0.8, "num_input_tokens_seen": 9287368, "step": 16105 }, { "epoch": 2.3994638069705094, "grad_norm": 0.34907037019729614, "learning_rate": 4.7076496130362455e-05, "loss": 0.8002, "num_input_tokens_seen": 9290376, "step": 16110 }, { "epoch": 2.4002085195114686, "grad_norm": 0.38758599758148193, "learning_rate": 4.707344573494492e-05, "loss": 0.8116, "num_input_tokens_seen": 9292968, "step": 16115 }, { "epoch": 2.400953232052428, "grad_norm": 0.38048994541168213, "learning_rate": 4.7070393847893665e-05, "loss": 0.811, "num_input_tokens_seen": 9295944, "step": 16120 }, { "epoch": 2.401697944593387, "grad_norm": 0.3794372081756592, "learning_rate": 4.706734046941492e-05, "loss": 0.8353, "num_input_tokens_seen": 9298568, "step": 16125 }, { "epoch": 2.4024426571343462, "grad_norm": 0.28533434867858887, "learning_rate": 4.706428559971502e-05, "loss": 0.7958, "num_input_tokens_seen": 9301576, "step": 16130 }, { "epoch": 2.4031873696753054, "grad_norm": 0.33868589997291565, "learning_rate": 4.706122923900042e-05, "loss": 0.8237, "num_input_tokens_seen": 9304648, "step": 16135 }, { "epoch": 2.4039320822162646, "grad_norm": 0.37844663858413696, "learning_rate": 4.705817138747763e-05, "loss": 0.8086, "num_input_tokens_seen": 9307400, "step": 16140 }, { "epoch": 2.404676794757224, "grad_norm": 0.31964215636253357, "learning_rate": 4.7055112045353304e-05, "loss": 0.7859, "num_input_tokens_seen": 9310120, "step": 16145 }, { "epoch": 2.405421507298183, "grad_norm": 0.3302017152309418, "learning_rate": 4.705205121283418e-05, "loss": 0.8149, "num_input_tokens_seen": 9312872, "step": 16150 }, { "epoch": 2.4061662198391423, "grad_norm": 0.35911789536476135, "learning_rate": 4.704898889012709e-05, "loss": 0.8044, "num_input_tokens_seen": 9315752, "step": 16155 }, { "epoch": 2.4069109323801015, "grad_norm": 0.3064684271812439, "learning_rate": 4.704592507743897e-05, "loss": 0.7963, "num_input_tokens_seen": 9318696, "step": 16160 }, { "epoch": 2.4076556449210607, "grad_norm": 0.33924588561058044, "learning_rate": 4.704285977497687e-05, "loss": 0.8244, "num_input_tokens_seen": 9321288, "step": 16165 }, { "epoch": 2.4084003574620194, "grad_norm": 0.3857293128967285, "learning_rate": 4.703979298294793e-05, "loss": 0.7938, "num_input_tokens_seen": 9323848, "step": 16170 }, { "epoch": 2.409145070002979, "grad_norm": 0.28743788599967957, "learning_rate": 4.703672470155938e-05, "loss": 0.8227, "num_input_tokens_seen": 9326856, "step": 16175 }, { "epoch": 2.409889782543938, "grad_norm": 0.5090489983558655, "learning_rate": 4.703365493101857e-05, "loss": 0.808, "num_input_tokens_seen": 9329832, "step": 16180 }, { "epoch": 2.4106344950848975, "grad_norm": 0.3928269147872925, "learning_rate": 4.703058367153295e-05, "loss": 0.7783, "num_input_tokens_seen": 9332616, "step": 16185 }, { "epoch": 2.4113792076258562, "grad_norm": 0.42484796047210693, "learning_rate": 4.702751092331005e-05, "loss": 0.7974, "num_input_tokens_seen": 9335848, "step": 16190 }, { "epoch": 2.4121239201668154, "grad_norm": 0.38635656237602234, "learning_rate": 4.7024436686557516e-05, "loss": 0.7891, "num_input_tokens_seen": 9339176, "step": 16195 }, { "epoch": 2.4128686327077746, "grad_norm": 0.3129091262817383, "learning_rate": 4.70213609614831e-05, "loss": 0.7824, "num_input_tokens_seen": 9342088, "step": 16200 }, { "epoch": 2.413613345248734, "grad_norm": 0.24913384020328522, "learning_rate": 4.701828374829464e-05, "loss": 0.7755, "num_input_tokens_seen": 9344904, "step": 16205 }, { "epoch": 2.414358057789693, "grad_norm": 0.2997592091560364, "learning_rate": 4.701520504720008e-05, "loss": 0.7891, "num_input_tokens_seen": 9347592, "step": 16210 }, { "epoch": 2.4151027703306522, "grad_norm": 0.34778162837028503, "learning_rate": 4.701212485840748e-05, "loss": 0.8053, "num_input_tokens_seen": 9350504, "step": 16215 }, { "epoch": 2.4158474828716114, "grad_norm": 0.3784128427505493, "learning_rate": 4.7009043182124966e-05, "loss": 0.8085, "num_input_tokens_seen": 9353608, "step": 16220 }, { "epoch": 2.4165921954125706, "grad_norm": 0.3401598334312439, "learning_rate": 4.700596001856081e-05, "loss": 0.7672, "num_input_tokens_seen": 9356712, "step": 16225 }, { "epoch": 2.41733690795353, "grad_norm": 0.32111701369285583, "learning_rate": 4.700287536792334e-05, "loss": 0.7875, "num_input_tokens_seen": 9359560, "step": 16230 }, { "epoch": 2.418081620494489, "grad_norm": 0.3675098419189453, "learning_rate": 4.6999789230421e-05, "loss": 0.7534, "num_input_tokens_seen": 9362376, "step": 16235 }, { "epoch": 2.4188263330354483, "grad_norm": 0.3274037837982178, "learning_rate": 4.699670160626236e-05, "loss": 0.7594, "num_input_tokens_seen": 9365224, "step": 16240 }, { "epoch": 2.4195710455764075, "grad_norm": 0.2254800945520401, "learning_rate": 4.699361249565605e-05, "loss": 0.8008, "num_input_tokens_seen": 9368168, "step": 16245 }, { "epoch": 2.4203157581173667, "grad_norm": 0.2919672727584839, "learning_rate": 4.699052189881083e-05, "loss": 0.8203, "num_input_tokens_seen": 9371176, "step": 16250 }, { "epoch": 2.421060470658326, "grad_norm": 0.30227765440940857, "learning_rate": 4.698742981593555e-05, "loss": 0.799, "num_input_tokens_seen": 9374216, "step": 16255 }, { "epoch": 2.421805183199285, "grad_norm": 0.399996817111969, "learning_rate": 4.6984336247239155e-05, "loss": 0.8583, "num_input_tokens_seen": 9377192, "step": 16260 }, { "epoch": 2.4225498957402443, "grad_norm": 0.2725752294063568, "learning_rate": 4.69812411929307e-05, "loss": 0.7956, "num_input_tokens_seen": 9380200, "step": 16265 }, { "epoch": 2.4232946082812035, "grad_norm": 0.3750384747982025, "learning_rate": 4.697814465321934e-05, "loss": 0.7877, "num_input_tokens_seen": 9382984, "step": 16270 }, { "epoch": 2.4240393208221627, "grad_norm": 0.3707553446292877, "learning_rate": 4.6975046628314304e-05, "loss": 0.8336, "num_input_tokens_seen": 9385512, "step": 16275 }, { "epoch": 2.424784033363122, "grad_norm": 0.41179975867271423, "learning_rate": 4.6971947118424976e-05, "loss": 0.8174, "num_input_tokens_seen": 9388520, "step": 16280 }, { "epoch": 2.425528745904081, "grad_norm": 0.4612787365913391, "learning_rate": 4.6968846123760786e-05, "loss": 0.7813, "num_input_tokens_seen": 9391304, "step": 16285 }, { "epoch": 2.4262734584450403, "grad_norm": 0.4657677114009857, "learning_rate": 4.696574364453129e-05, "loss": 0.8321, "num_input_tokens_seen": 9393928, "step": 16290 }, { "epoch": 2.4270181709859995, "grad_norm": 0.2899855077266693, "learning_rate": 4.6962639680946154e-05, "loss": 0.8402, "num_input_tokens_seen": 9396680, "step": 16295 }, { "epoch": 2.4277628835269587, "grad_norm": 0.5664563179016113, "learning_rate": 4.6959534233215116e-05, "loss": 0.8047, "num_input_tokens_seen": 9399752, "step": 16300 }, { "epoch": 2.428507596067918, "grad_norm": 0.368525892496109, "learning_rate": 4.695642730154804e-05, "loss": 0.7833, "num_input_tokens_seen": 9402536, "step": 16305 }, { "epoch": 2.429252308608877, "grad_norm": 0.3311220407485962, "learning_rate": 4.695331888615487e-05, "loss": 0.845, "num_input_tokens_seen": 9405416, "step": 16310 }, { "epoch": 2.4299970211498363, "grad_norm": 0.39370691776275635, "learning_rate": 4.695020898724567e-05, "loss": 0.8048, "num_input_tokens_seen": 9408360, "step": 16315 }, { "epoch": 2.4307417336907955, "grad_norm": 0.3952200710773468, "learning_rate": 4.694709760503059e-05, "loss": 0.8136, "num_input_tokens_seen": 9411432, "step": 16320 }, { "epoch": 2.4314864462317547, "grad_norm": 0.29120510816574097, "learning_rate": 4.694398473971988e-05, "loss": 0.7815, "num_input_tokens_seen": 9414312, "step": 16325 }, { "epoch": 2.432231158772714, "grad_norm": 0.31018784642219543, "learning_rate": 4.6940870391523905e-05, "loss": 0.8076, "num_input_tokens_seen": 9416968, "step": 16330 }, { "epoch": 2.432975871313673, "grad_norm": 0.39057132601737976, "learning_rate": 4.693775456065311e-05, "loss": 0.8356, "num_input_tokens_seen": 9419688, "step": 16335 }, { "epoch": 2.4337205838546323, "grad_norm": 0.32223692536354065, "learning_rate": 4.693463724731805e-05, "loss": 0.8164, "num_input_tokens_seen": 9422760, "step": 16340 }, { "epoch": 2.434465296395591, "grad_norm": 0.3987480401992798, "learning_rate": 4.693151845172939e-05, "loss": 0.8082, "num_input_tokens_seen": 9425512, "step": 16345 }, { "epoch": 2.4352100089365507, "grad_norm": 0.30692625045776367, "learning_rate": 4.692839817409788e-05, "loss": 0.8144, "num_input_tokens_seen": 9428520, "step": 16350 }, { "epoch": 2.4359547214775095, "grad_norm": 0.33150458335876465, "learning_rate": 4.6925276414634375e-05, "loss": 0.8572, "num_input_tokens_seen": 9431432, "step": 16355 }, { "epoch": 2.436699434018469, "grad_norm": 0.3917941451072693, "learning_rate": 4.6922153173549835e-05, "loss": 0.8129, "num_input_tokens_seen": 9434312, "step": 16360 }, { "epoch": 2.437444146559428, "grad_norm": 0.4391461908817291, "learning_rate": 4.691902845105531e-05, "loss": 0.7837, "num_input_tokens_seen": 9437224, "step": 16365 }, { "epoch": 2.438188859100387, "grad_norm": 0.3446193337440491, "learning_rate": 4.6915902247361954e-05, "loss": 0.797, "num_input_tokens_seen": 9439976, "step": 16370 }, { "epoch": 2.4389335716413463, "grad_norm": 0.36628907918930054, "learning_rate": 4.691277456268103e-05, "loss": 0.7957, "num_input_tokens_seen": 9443208, "step": 16375 }, { "epoch": 2.4396782841823055, "grad_norm": 0.3410237431526184, "learning_rate": 4.690964539722389e-05, "loss": 0.8111, "num_input_tokens_seen": 9445864, "step": 16380 }, { "epoch": 2.4404229967232647, "grad_norm": 0.31144124269485474, "learning_rate": 4.6906514751202005e-05, "loss": 0.794, "num_input_tokens_seen": 9448936, "step": 16385 }, { "epoch": 2.441167709264224, "grad_norm": 0.37812554836273193, "learning_rate": 4.690338262482691e-05, "loss": 0.8079, "num_input_tokens_seen": 9451976, "step": 16390 }, { "epoch": 2.441912421805183, "grad_norm": 0.3610036075115204, "learning_rate": 4.690024901831026e-05, "loss": 0.7852, "num_input_tokens_seen": 9454824, "step": 16395 }, { "epoch": 2.4426571343461423, "grad_norm": 0.33706557750701904, "learning_rate": 4.6897113931863837e-05, "loss": 0.7951, "num_input_tokens_seen": 9457704, "step": 16400 }, { "epoch": 2.4434018468871015, "grad_norm": 0.3123951852321625, "learning_rate": 4.6893977365699474e-05, "loss": 0.8305, "num_input_tokens_seen": 9460776, "step": 16405 }, { "epoch": 2.4441465594280607, "grad_norm": 0.3373812735080719, "learning_rate": 4.6890839320029134e-05, "loss": 0.7954, "num_input_tokens_seen": 9463848, "step": 16410 }, { "epoch": 2.44489127196902, "grad_norm": 0.2924342751502991, "learning_rate": 4.688769979506488e-05, "loss": 0.7927, "num_input_tokens_seen": 9466568, "step": 16415 }, { "epoch": 2.445635984509979, "grad_norm": 0.50669926404953, "learning_rate": 4.688455879101885e-05, "loss": 0.8081, "num_input_tokens_seen": 9469800, "step": 16420 }, { "epoch": 2.4463806970509383, "grad_norm": 0.2883927822113037, "learning_rate": 4.688141630810333e-05, "loss": 0.8104, "num_input_tokens_seen": 9472712, "step": 16425 }, { "epoch": 2.4471254095918975, "grad_norm": 0.3989277184009552, "learning_rate": 4.687827234653065e-05, "loss": 0.7965, "num_input_tokens_seen": 9475656, "step": 16430 }, { "epoch": 2.4478701221328567, "grad_norm": 0.33229318261146545, "learning_rate": 4.687512690651328e-05, "loss": 0.8104, "num_input_tokens_seen": 9478312, "step": 16435 }, { "epoch": 2.448614834673816, "grad_norm": 0.35581183433532715, "learning_rate": 4.687197998826376e-05, "loss": 0.795, "num_input_tokens_seen": 9481352, "step": 16440 }, { "epoch": 2.449359547214775, "grad_norm": 0.37571048736572266, "learning_rate": 4.686883159199477e-05, "loss": 0.788, "num_input_tokens_seen": 9484072, "step": 16445 }, { "epoch": 2.4501042597557343, "grad_norm": 0.35929587483406067, "learning_rate": 4.6865681717919047e-05, "loss": 0.8011, "num_input_tokens_seen": 9487080, "step": 16450 }, { "epoch": 2.4508489722966935, "grad_norm": 0.3723979592323303, "learning_rate": 4.686253036624946e-05, "loss": 0.8327, "num_input_tokens_seen": 9489928, "step": 16455 }, { "epoch": 2.4515936848376527, "grad_norm": 0.39557451009750366, "learning_rate": 4.6859377537198945e-05, "loss": 0.8443, "num_input_tokens_seen": 9493128, "step": 16460 }, { "epoch": 2.452338397378612, "grad_norm": 0.4116095006465912, "learning_rate": 4.6856223230980576e-05, "loss": 0.8091, "num_input_tokens_seen": 9496072, "step": 16465 }, { "epoch": 2.453083109919571, "grad_norm": 0.39285844564437866, "learning_rate": 4.6853067447807505e-05, "loss": 0.7979, "num_input_tokens_seen": 9498984, "step": 16470 }, { "epoch": 2.4538278224605303, "grad_norm": 0.2865229845046997, "learning_rate": 4.684991018789298e-05, "loss": 0.8193, "num_input_tokens_seen": 9501864, "step": 16475 }, { "epoch": 2.4545725350014895, "grad_norm": 0.4205085337162018, "learning_rate": 4.6846751451450366e-05, "loss": 0.8233, "num_input_tokens_seen": 9504840, "step": 16480 }, { "epoch": 2.4553172475424487, "grad_norm": 0.348311185836792, "learning_rate": 4.684359123869311e-05, "loss": 0.7976, "num_input_tokens_seen": 9507496, "step": 16485 }, { "epoch": 2.456061960083408, "grad_norm": 0.411248117685318, "learning_rate": 4.684042954983476e-05, "loss": 0.787, "num_input_tokens_seen": 9510376, "step": 16490 }, { "epoch": 2.456806672624367, "grad_norm": 0.31698691844940186, "learning_rate": 4.683726638508899e-05, "loss": 0.8167, "num_input_tokens_seen": 9513160, "step": 16495 }, { "epoch": 2.4575513851653263, "grad_norm": 0.44116538763046265, "learning_rate": 4.6834101744669526e-05, "loss": 0.8709, "num_input_tokens_seen": 9516168, "step": 16500 }, { "epoch": 2.4582960977062855, "grad_norm": 0.3568224012851715, "learning_rate": 4.683093562879024e-05, "loss": 0.7884, "num_input_tokens_seen": 9519048, "step": 16505 }, { "epoch": 2.4590408102472447, "grad_norm": 0.30547434091567993, "learning_rate": 4.682776803766509e-05, "loss": 0.7988, "num_input_tokens_seen": 9521704, "step": 16510 }, { "epoch": 2.459785522788204, "grad_norm": 0.3032124936580658, "learning_rate": 4.682459897150812e-05, "loss": 0.8043, "num_input_tokens_seen": 9524392, "step": 16515 }, { "epoch": 2.4605302353291627, "grad_norm": 0.30619674921035767, "learning_rate": 4.682142843053348e-05, "loss": 0.8031, "num_input_tokens_seen": 9527720, "step": 16520 }, { "epoch": 2.4612749478701224, "grad_norm": 0.3466559946537018, "learning_rate": 4.681825641495543e-05, "loss": 0.8065, "num_input_tokens_seen": 9530568, "step": 16525 }, { "epoch": 2.462019660411081, "grad_norm": 0.4794130325317383, "learning_rate": 4.681508292498832e-05, "loss": 0.807, "num_input_tokens_seen": 9533512, "step": 16530 }, { "epoch": 2.4627643729520408, "grad_norm": 0.23231258988380432, "learning_rate": 4.681190796084659e-05, "loss": 0.811, "num_input_tokens_seen": 9536424, "step": 16535 }, { "epoch": 2.4635090854929995, "grad_norm": 0.31686440110206604, "learning_rate": 4.680873152274481e-05, "loss": 0.8124, "num_input_tokens_seen": 9539176, "step": 16540 }, { "epoch": 2.4642537980339587, "grad_norm": 0.2691118121147156, "learning_rate": 4.680555361089762e-05, "loss": 0.8028, "num_input_tokens_seen": 9541864, "step": 16545 }, { "epoch": 2.464998510574918, "grad_norm": 0.5607085824012756, "learning_rate": 4.680237422551977e-05, "loss": 0.8081, "num_input_tokens_seen": 9545064, "step": 16550 }, { "epoch": 2.465743223115877, "grad_norm": 0.30356037616729736, "learning_rate": 4.679919336682611e-05, "loss": 0.7781, "num_input_tokens_seen": 9548008, "step": 16555 }, { "epoch": 2.4664879356568363, "grad_norm": 0.3730795085430145, "learning_rate": 4.6796011035031596e-05, "loss": 0.8071, "num_input_tokens_seen": 9550632, "step": 16560 }, { "epoch": 2.4672326481977955, "grad_norm": 0.29738593101501465, "learning_rate": 4.6792827230351265e-05, "loss": 0.823, "num_input_tokens_seen": 9553576, "step": 16565 }, { "epoch": 2.4679773607387547, "grad_norm": 0.29013583064079285, "learning_rate": 4.678964195300028e-05, "loss": 0.792, "num_input_tokens_seen": 9556168, "step": 16570 }, { "epoch": 2.468722073279714, "grad_norm": 0.4089330732822418, "learning_rate": 4.678645520319388e-05, "loss": 0.8137, "num_input_tokens_seen": 9558888, "step": 16575 }, { "epoch": 2.469466785820673, "grad_norm": 0.28144463896751404, "learning_rate": 4.678326698114741e-05, "loss": 0.7638, "num_input_tokens_seen": 9561768, "step": 16580 }, { "epoch": 2.4702114983616323, "grad_norm": 0.3542139232158661, "learning_rate": 4.678007728707633e-05, "loss": 0.8504, "num_input_tokens_seen": 9564776, "step": 16585 }, { "epoch": 2.4709562109025915, "grad_norm": 0.3439910411834717, "learning_rate": 4.6776886121196175e-05, "loss": 0.7961, "num_input_tokens_seen": 9567880, "step": 16590 }, { "epoch": 2.4717009234435507, "grad_norm": 0.5121296048164368, "learning_rate": 4.677369348372259e-05, "loss": 0.8523, "num_input_tokens_seen": 9570984, "step": 16595 }, { "epoch": 2.47244563598451, "grad_norm": 0.377210795879364, "learning_rate": 4.677049937487134e-05, "loss": 0.8207, "num_input_tokens_seen": 9573992, "step": 16600 }, { "epoch": 2.473190348525469, "grad_norm": 0.35089853405952454, "learning_rate": 4.6767303794858235e-05, "loss": 0.7914, "num_input_tokens_seen": 9577096, "step": 16605 }, { "epoch": 2.4739350610664284, "grad_norm": 0.37696871161460876, "learning_rate": 4.676410674389925e-05, "loss": 0.8242, "num_input_tokens_seen": 9580168, "step": 16610 }, { "epoch": 2.4746797736073876, "grad_norm": 0.3227851390838623, "learning_rate": 4.676090822221042e-05, "loss": 0.7983, "num_input_tokens_seen": 9582888, "step": 16615 }, { "epoch": 2.4754244861483468, "grad_norm": 0.335214227437973, "learning_rate": 4.6757708230007877e-05, "loss": 0.8001, "num_input_tokens_seen": 9585832, "step": 16620 }, { "epoch": 2.476169198689306, "grad_norm": 0.3652363717556, "learning_rate": 4.6754506767507874e-05, "loss": 0.7947, "num_input_tokens_seen": 9588680, "step": 16625 }, { "epoch": 2.476913911230265, "grad_norm": 0.26014724373817444, "learning_rate": 4.6751303834926755e-05, "loss": 0.7959, "num_input_tokens_seen": 9591304, "step": 16630 }, { "epoch": 2.4776586237712244, "grad_norm": 0.3041701316833496, "learning_rate": 4.674809943248095e-05, "loss": 0.8279, "num_input_tokens_seen": 9594184, "step": 16635 }, { "epoch": 2.4784033363121836, "grad_norm": 0.3693889379501343, "learning_rate": 4.674489356038702e-05, "loss": 0.8378, "num_input_tokens_seen": 9597000, "step": 16640 }, { "epoch": 2.4791480488531428, "grad_norm": 0.31512758135795593, "learning_rate": 4.674168621886158e-05, "loss": 0.7985, "num_input_tokens_seen": 9600040, "step": 16645 }, { "epoch": 2.479892761394102, "grad_norm": 0.32044899463653564, "learning_rate": 4.673847740812138e-05, "loss": 0.8159, "num_input_tokens_seen": 9602984, "step": 16650 }, { "epoch": 2.480637473935061, "grad_norm": 0.296117901802063, "learning_rate": 4.673526712838326e-05, "loss": 0.8019, "num_input_tokens_seen": 9605672, "step": 16655 }, { "epoch": 2.4813821864760204, "grad_norm": 0.3183281719684601, "learning_rate": 4.673205537986416e-05, "loss": 0.7931, "num_input_tokens_seen": 9608232, "step": 16660 }, { "epoch": 2.4821268990169796, "grad_norm": 0.38854891061782837, "learning_rate": 4.672884216278112e-05, "loss": 0.8375, "num_input_tokens_seen": 9610984, "step": 16665 }, { "epoch": 2.482871611557939, "grad_norm": 0.36065709590911865, "learning_rate": 4.672562747735126e-05, "loss": 0.7867, "num_input_tokens_seen": 9614120, "step": 16670 }, { "epoch": 2.483616324098898, "grad_norm": 0.3968534469604492, "learning_rate": 4.6722411323791824e-05, "loss": 0.7671, "num_input_tokens_seen": 9617448, "step": 16675 }, { "epoch": 2.484361036639857, "grad_norm": 0.3130744993686676, "learning_rate": 4.671919370232015e-05, "loss": 0.8233, "num_input_tokens_seen": 9620520, "step": 16680 }, { "epoch": 2.4851057491808164, "grad_norm": 0.39817923307418823, "learning_rate": 4.671597461315367e-05, "loss": 0.8711, "num_input_tokens_seen": 9623272, "step": 16685 }, { "epoch": 2.4858504617217756, "grad_norm": 0.3321145176887512, "learning_rate": 4.6712754056509924e-05, "loss": 0.7934, "num_input_tokens_seen": 9626088, "step": 16690 }, { "epoch": 2.4865951742627344, "grad_norm": 0.31892961263656616, "learning_rate": 4.670953203260653e-05, "loss": 0.7762, "num_input_tokens_seen": 9629288, "step": 16695 }, { "epoch": 2.487339886803694, "grad_norm": 0.39282771944999695, "learning_rate": 4.6706308541661224e-05, "loss": 0.7942, "num_input_tokens_seen": 9631816, "step": 16700 }, { "epoch": 2.4880845993446528, "grad_norm": 0.3902244567871094, "learning_rate": 4.670308358389184e-05, "loss": 0.8667, "num_input_tokens_seen": 9634504, "step": 16705 }, { "epoch": 2.488829311885612, "grad_norm": 0.27065160870552063, "learning_rate": 4.66998571595163e-05, "loss": 0.7757, "num_input_tokens_seen": 9637448, "step": 16710 }, { "epoch": 2.489574024426571, "grad_norm": 0.3883775770664215, "learning_rate": 4.6696629268752647e-05, "loss": 0.7868, "num_input_tokens_seen": 9640072, "step": 16715 }, { "epoch": 2.4903187369675304, "grad_norm": 0.4230045676231384, "learning_rate": 4.6693399911818994e-05, "loss": 0.857, "num_input_tokens_seen": 9642856, "step": 16720 }, { "epoch": 2.4910634495084896, "grad_norm": 0.39860427379608154, "learning_rate": 4.669016908893358e-05, "loss": 0.8176, "num_input_tokens_seen": 9645640, "step": 16725 }, { "epoch": 2.4918081620494488, "grad_norm": 0.40188875794410706, "learning_rate": 4.668693680031472e-05, "loss": 0.8138, "num_input_tokens_seen": 9648264, "step": 16730 }, { "epoch": 2.492552874590408, "grad_norm": 0.32221537828445435, "learning_rate": 4.668370304618084e-05, "loss": 0.7888, "num_input_tokens_seen": 9650728, "step": 16735 }, { "epoch": 2.493297587131367, "grad_norm": 0.3195447027683258, "learning_rate": 4.668046782675048e-05, "loss": 0.797, "num_input_tokens_seen": 9653320, "step": 16740 }, { "epoch": 2.4940422996723264, "grad_norm": 0.25529831647872925, "learning_rate": 4.667723114224224e-05, "loss": 0.8095, "num_input_tokens_seen": 9655848, "step": 16745 }, { "epoch": 2.4947870122132856, "grad_norm": 0.3047111928462982, "learning_rate": 4.6673992992874855e-05, "loss": 0.7971, "num_input_tokens_seen": 9658696, "step": 16750 }, { "epoch": 2.495531724754245, "grad_norm": 0.33083486557006836, "learning_rate": 4.667075337886714e-05, "loss": 0.8161, "num_input_tokens_seen": 9661960, "step": 16755 }, { "epoch": 2.496276437295204, "grad_norm": 0.411718487739563, "learning_rate": 4.6667512300438025e-05, "loss": 0.7878, "num_input_tokens_seen": 9664744, "step": 16760 }, { "epoch": 2.497021149836163, "grad_norm": 0.3139605224132538, "learning_rate": 4.6664269757806525e-05, "loss": 0.8019, "num_input_tokens_seen": 9667528, "step": 16765 }, { "epoch": 2.4977658623771224, "grad_norm": 0.4080588221549988, "learning_rate": 4.6661025751191746e-05, "loss": 0.8449, "num_input_tokens_seen": 9670280, "step": 16770 }, { "epoch": 2.4985105749180816, "grad_norm": 0.24597179889678955, "learning_rate": 4.665778028081292e-05, "loss": 0.7948, "num_input_tokens_seen": 9673128, "step": 16775 }, { "epoch": 2.499255287459041, "grad_norm": 0.33431264758110046, "learning_rate": 4.6654533346889356e-05, "loss": 0.8138, "num_input_tokens_seen": 9675976, "step": 16780 }, { "epoch": 2.5, "grad_norm": 0.41197627782821655, "learning_rate": 4.665128494964047e-05, "loss": 0.7843, "num_input_tokens_seen": 9678632, "step": 16785 }, { "epoch": 2.5, "eval_loss": 0.8061988949775696, "eval_runtime": 45.4118, "eval_samples_per_second": 65.71, "eval_steps_per_second": 16.427, "num_input_tokens_seen": 9678632, "step": 16785 }, { "epoch": 2.500744712540959, "grad_norm": 0.306326299905777, "learning_rate": 4.664803508928577e-05, "loss": 0.8164, "num_input_tokens_seen": 9681448, "step": 16790 }, { "epoch": 2.5014894250819184, "grad_norm": 0.245680570602417, "learning_rate": 4.664478376604488e-05, "loss": 0.7926, "num_input_tokens_seen": 9684456, "step": 16795 }, { "epoch": 2.5022341376228776, "grad_norm": 0.3559504449367523, "learning_rate": 4.6641530980137506e-05, "loss": 0.8362, "num_input_tokens_seen": 9687496, "step": 16800 }, { "epoch": 2.502978850163837, "grad_norm": 0.282922625541687, "learning_rate": 4.663827673178345e-05, "loss": 0.7654, "num_input_tokens_seen": 9690184, "step": 16805 }, { "epoch": 2.503723562704796, "grad_norm": 0.39450299739837646, "learning_rate": 4.6635021021202624e-05, "loss": 0.8324, "num_input_tokens_seen": 9692808, "step": 16810 }, { "epoch": 2.504468275245755, "grad_norm": 0.312913179397583, "learning_rate": 4.6631763848615044e-05, "loss": 0.8191, "num_input_tokens_seen": 9695528, "step": 16815 }, { "epoch": 2.5052129877867144, "grad_norm": 0.2518395483493805, "learning_rate": 4.662850521424081e-05, "loss": 0.8346, "num_input_tokens_seen": 9698440, "step": 16820 }, { "epoch": 2.5059577003276736, "grad_norm": 0.21466948091983795, "learning_rate": 4.662524511830013e-05, "loss": 0.7754, "num_input_tokens_seen": 9701096, "step": 16825 }, { "epoch": 2.506702412868633, "grad_norm": 0.36183977127075195, "learning_rate": 4.662198356101331e-05, "loss": 0.7915, "num_input_tokens_seen": 9704008, "step": 16830 }, { "epoch": 2.507447125409592, "grad_norm": 0.38344958424568176, "learning_rate": 4.6618720542600744e-05, "loss": 0.8106, "num_input_tokens_seen": 9706824, "step": 16835 }, { "epoch": 2.5081918379505512, "grad_norm": 0.3940441906452179, "learning_rate": 4.6615456063282944e-05, "loss": 0.8213, "num_input_tokens_seen": 9709576, "step": 16840 }, { "epoch": 2.5089365504915104, "grad_norm": 0.3209611773490906, "learning_rate": 4.66121901232805e-05, "loss": 0.7893, "num_input_tokens_seen": 9712264, "step": 16845 }, { "epoch": 2.509681263032469, "grad_norm": 0.3488834798336029, "learning_rate": 4.6608922722814116e-05, "loss": 0.7909, "num_input_tokens_seen": 9715112, "step": 16850 }, { "epoch": 2.510425975573429, "grad_norm": 0.359210342168808, "learning_rate": 4.6605653862104596e-05, "loss": 0.8296, "num_input_tokens_seen": 9717832, "step": 16855 }, { "epoch": 2.5111706881143876, "grad_norm": 0.2878257632255554, "learning_rate": 4.660238354137283e-05, "loss": 0.7931, "num_input_tokens_seen": 9720712, "step": 16860 }, { "epoch": 2.5119154006553472, "grad_norm": 0.288821816444397, "learning_rate": 4.6599111760839805e-05, "loss": 0.8044, "num_input_tokens_seen": 9723784, "step": 16865 }, { "epoch": 2.512660113196306, "grad_norm": 0.38947924971580505, "learning_rate": 4.659583852072663e-05, "loss": 0.8323, "num_input_tokens_seen": 9726824, "step": 16870 }, { "epoch": 2.5134048257372656, "grad_norm": 0.2556212544441223, "learning_rate": 4.6592563821254486e-05, "loss": 0.8303, "num_input_tokens_seen": 9729800, "step": 16875 }, { "epoch": 2.5141495382782244, "grad_norm": 0.33640536665916443, "learning_rate": 4.658928766264467e-05, "loss": 0.7989, "num_input_tokens_seen": 9732680, "step": 16880 }, { "epoch": 2.514894250819184, "grad_norm": 0.3191239833831787, "learning_rate": 4.658601004511856e-05, "loss": 0.7845, "num_input_tokens_seen": 9735688, "step": 16885 }, { "epoch": 2.515638963360143, "grad_norm": 0.35872331261634827, "learning_rate": 4.658273096889768e-05, "loss": 0.8115, "num_input_tokens_seen": 9738728, "step": 16890 }, { "epoch": 2.516383675901102, "grad_norm": 0.2977898120880127, "learning_rate": 4.657945043420356e-05, "loss": 0.8187, "num_input_tokens_seen": 9741512, "step": 16895 }, { "epoch": 2.517128388442061, "grad_norm": 0.2765488922595978, "learning_rate": 4.657616844125794e-05, "loss": 0.7797, "num_input_tokens_seen": 9744200, "step": 16900 }, { "epoch": 2.5178731009830204, "grad_norm": 0.3509315252304077, "learning_rate": 4.657288499028256e-05, "loss": 0.8239, "num_input_tokens_seen": 9747432, "step": 16905 }, { "epoch": 2.5186178135239796, "grad_norm": 0.26614609360694885, "learning_rate": 4.656960008149933e-05, "loss": 0.831, "num_input_tokens_seen": 9750536, "step": 16910 }, { "epoch": 2.519362526064939, "grad_norm": 0.31887099146842957, "learning_rate": 4.656631371513022e-05, "loss": 0.7914, "num_input_tokens_seen": 9753320, "step": 16915 }, { "epoch": 2.520107238605898, "grad_norm": 0.2946830689907074, "learning_rate": 4.656302589139732e-05, "loss": 0.7901, "num_input_tokens_seen": 9756040, "step": 16920 }, { "epoch": 2.5208519511468572, "grad_norm": 0.41955673694610596, "learning_rate": 4.655973661052279e-05, "loss": 0.8396, "num_input_tokens_seen": 9759112, "step": 16925 }, { "epoch": 2.5215966636878164, "grad_norm": 0.37643569707870483, "learning_rate": 4.655644587272891e-05, "loss": 0.7944, "num_input_tokens_seen": 9762088, "step": 16930 }, { "epoch": 2.5223413762287756, "grad_norm": 0.3277197480201721, "learning_rate": 4.655315367823806e-05, "loss": 0.815, "num_input_tokens_seen": 9764776, "step": 16935 }, { "epoch": 2.523086088769735, "grad_norm": 0.27887043356895447, "learning_rate": 4.654986002727273e-05, "loss": 0.7965, "num_input_tokens_seen": 9767656, "step": 16940 }, { "epoch": 2.523830801310694, "grad_norm": 0.3206521272659302, "learning_rate": 4.6546564920055455e-05, "loss": 0.8077, "num_input_tokens_seen": 9770376, "step": 16945 }, { "epoch": 2.5245755138516532, "grad_norm": 0.34454014897346497, "learning_rate": 4.654326835680894e-05, "loss": 0.7948, "num_input_tokens_seen": 9773320, "step": 16950 }, { "epoch": 2.5253202263926124, "grad_norm": 0.4315240979194641, "learning_rate": 4.6539970337755936e-05, "loss": 0.8223, "num_input_tokens_seen": 9776264, "step": 16955 }, { "epoch": 2.5260649389335716, "grad_norm": 0.35418975353240967, "learning_rate": 4.6536670863119305e-05, "loss": 0.846, "num_input_tokens_seen": 9779304, "step": 16960 }, { "epoch": 2.526809651474531, "grad_norm": 0.38284432888031006, "learning_rate": 4.6533369933122014e-05, "loss": 0.8068, "num_input_tokens_seen": 9782408, "step": 16965 }, { "epoch": 2.52755436401549, "grad_norm": 0.36519092321395874, "learning_rate": 4.6530067547987145e-05, "loss": 0.8008, "num_input_tokens_seen": 9785384, "step": 16970 }, { "epoch": 2.5282990765564493, "grad_norm": 0.35785648226737976, "learning_rate": 4.652676370793784e-05, "loss": 0.7997, "num_input_tokens_seen": 9788264, "step": 16975 }, { "epoch": 2.5290437890974085, "grad_norm": 0.2794623076915741, "learning_rate": 4.6523458413197364e-05, "loss": 0.8242, "num_input_tokens_seen": 9791048, "step": 16980 }, { "epoch": 2.5297885016383677, "grad_norm": 0.2646716833114624, "learning_rate": 4.6520151663989075e-05, "loss": 0.8103, "num_input_tokens_seen": 9793704, "step": 16985 }, { "epoch": 2.530533214179327, "grad_norm": 0.258474737405777, "learning_rate": 4.6516843460536434e-05, "loss": 0.7987, "num_input_tokens_seen": 9796712, "step": 16990 }, { "epoch": 2.531277926720286, "grad_norm": 0.27640533447265625, "learning_rate": 4.651353380306299e-05, "loss": 0.8122, "num_input_tokens_seen": 9799432, "step": 16995 }, { "epoch": 2.5320226392612453, "grad_norm": 0.3312889337539673, "learning_rate": 4.65102226917924e-05, "loss": 0.7966, "num_input_tokens_seen": 9802600, "step": 17000 }, { "epoch": 2.5327673518022045, "grad_norm": 0.39573749899864197, "learning_rate": 4.650691012694842e-05, "loss": 0.8144, "num_input_tokens_seen": 9805736, "step": 17005 }, { "epoch": 2.5335120643431637, "grad_norm": 0.24294427037239075, "learning_rate": 4.650359610875489e-05, "loss": 0.7827, "num_input_tokens_seen": 9808392, "step": 17010 }, { "epoch": 2.534256776884123, "grad_norm": 0.42567628622055054, "learning_rate": 4.650028063743577e-05, "loss": 0.8476, "num_input_tokens_seen": 9811464, "step": 17015 }, { "epoch": 2.535001489425082, "grad_norm": 0.3846708834171295, "learning_rate": 4.649696371321509e-05, "loss": 0.7985, "num_input_tokens_seen": 9814120, "step": 17020 }, { "epoch": 2.535746201966041, "grad_norm": 0.3221745789051056, "learning_rate": 4.6493645336317e-05, "loss": 0.779, "num_input_tokens_seen": 9816936, "step": 17025 }, { "epoch": 2.5364909145070005, "grad_norm": 0.3220739960670471, "learning_rate": 4.6490325506965746e-05, "loss": 0.8281, "num_input_tokens_seen": 9819720, "step": 17030 }, { "epoch": 2.5372356270479592, "grad_norm": 0.3142620623111725, "learning_rate": 4.648700422538567e-05, "loss": 0.7725, "num_input_tokens_seen": 9822472, "step": 17035 }, { "epoch": 2.537980339588919, "grad_norm": 0.2929942309856415, "learning_rate": 4.648368149180121e-05, "loss": 0.8019, "num_input_tokens_seen": 9825352, "step": 17040 }, { "epoch": 2.5387250521298776, "grad_norm": 0.3171640932559967, "learning_rate": 4.64803573064369e-05, "loss": 0.7973, "num_input_tokens_seen": 9828296, "step": 17045 }, { "epoch": 2.5394697646708373, "grad_norm": 0.3615972101688385, "learning_rate": 4.647703166951738e-05, "loss": 0.7947, "num_input_tokens_seen": 9831048, "step": 17050 }, { "epoch": 2.540214477211796, "grad_norm": 0.3534790873527527, "learning_rate": 4.6473704581267374e-05, "loss": 0.8108, "num_input_tokens_seen": 9833896, "step": 17055 }, { "epoch": 2.5409591897527557, "grad_norm": 0.3585100471973419, "learning_rate": 4.6470376041911715e-05, "loss": 0.7836, "num_input_tokens_seen": 9836904, "step": 17060 }, { "epoch": 2.5417039022937145, "grad_norm": 0.4475565552711487, "learning_rate": 4.646704605167534e-05, "loss": 0.8048, "num_input_tokens_seen": 9839816, "step": 17065 }, { "epoch": 2.5424486148346737, "grad_norm": 0.2949371933937073, "learning_rate": 4.646371461078327e-05, "loss": 0.8079, "num_input_tokens_seen": 9842792, "step": 17070 }, { "epoch": 2.543193327375633, "grad_norm": 0.4817443788051605, "learning_rate": 4.646038171946063e-05, "loss": 0.8163, "num_input_tokens_seen": 9845416, "step": 17075 }, { "epoch": 2.543938039916592, "grad_norm": 0.2733737528324127, "learning_rate": 4.645704737793265e-05, "loss": 0.7611, "num_input_tokens_seen": 9848584, "step": 17080 }, { "epoch": 2.5446827524575513, "grad_norm": 0.3495226204395294, "learning_rate": 4.645371158642464e-05, "loss": 0.7653, "num_input_tokens_seen": 9851816, "step": 17085 }, { "epoch": 2.5454274649985105, "grad_norm": 0.3318015933036804, "learning_rate": 4.645037434516204e-05, "loss": 0.78, "num_input_tokens_seen": 9854728, "step": 17090 }, { "epoch": 2.5461721775394697, "grad_norm": 0.4078737497329712, "learning_rate": 4.644703565437033e-05, "loss": 0.8384, "num_input_tokens_seen": 9857576, "step": 17095 }, { "epoch": 2.546916890080429, "grad_norm": 0.43900564312934875, "learning_rate": 4.644369551427516e-05, "loss": 0.7844, "num_input_tokens_seen": 9860296, "step": 17100 }, { "epoch": 2.547661602621388, "grad_norm": 0.28358837962150574, "learning_rate": 4.6440353925102234e-05, "loss": 0.7861, "num_input_tokens_seen": 9863176, "step": 17105 }, { "epoch": 2.5484063151623473, "grad_norm": 0.2973007559776306, "learning_rate": 4.643701088707736e-05, "loss": 0.8049, "num_input_tokens_seen": 9866152, "step": 17110 }, { "epoch": 2.5491510277033065, "grad_norm": 0.35933664441108704, "learning_rate": 4.643366640042643e-05, "loss": 0.8346, "num_input_tokens_seen": 9868904, "step": 17115 }, { "epoch": 2.5498957402442657, "grad_norm": 0.40600305795669556, "learning_rate": 4.643032046537549e-05, "loss": 0.8005, "num_input_tokens_seen": 9871560, "step": 17120 }, { "epoch": 2.550640452785225, "grad_norm": 0.34931740164756775, "learning_rate": 4.642697308215061e-05, "loss": 0.876, "num_input_tokens_seen": 9874536, "step": 17125 }, { "epoch": 2.551385165326184, "grad_norm": 0.29199865460395813, "learning_rate": 4.6423624250978e-05, "loss": 0.8244, "num_input_tokens_seen": 9877448, "step": 17130 }, { "epoch": 2.5521298778671433, "grad_norm": 0.3152143657207489, "learning_rate": 4.6420273972083985e-05, "loss": 0.8025, "num_input_tokens_seen": 9880296, "step": 17135 }, { "epoch": 2.5528745904081025, "grad_norm": 0.269584983587265, "learning_rate": 4.641692224569493e-05, "loss": 0.8003, "num_input_tokens_seen": 9882920, "step": 17140 }, { "epoch": 2.5536193029490617, "grad_norm": 0.28269392251968384, "learning_rate": 4.641356907203734e-05, "loss": 0.7755, "num_input_tokens_seen": 9885736, "step": 17145 }, { "epoch": 2.554364015490021, "grad_norm": 0.35474732518196106, "learning_rate": 4.6410214451337816e-05, "loss": 0.7721, "num_input_tokens_seen": 9888488, "step": 17150 }, { "epoch": 2.55510872803098, "grad_norm": 0.36213988065719604, "learning_rate": 4.6406858383823056e-05, "loss": 0.7734, "num_input_tokens_seen": 9891368, "step": 17155 }, { "epoch": 2.5558534405719393, "grad_norm": 0.3304959535598755, "learning_rate": 4.640350086971983e-05, "loss": 0.8265, "num_input_tokens_seen": 9894280, "step": 17160 }, { "epoch": 2.5565981531128985, "grad_norm": 0.3605861961841583, "learning_rate": 4.640014190925505e-05, "loss": 0.8567, "num_input_tokens_seen": 9897032, "step": 17165 }, { "epoch": 2.5573428656538577, "grad_norm": 0.36708036065101624, "learning_rate": 4.639678150265567e-05, "loss": 0.8115, "num_input_tokens_seen": 9900392, "step": 17170 }, { "epoch": 2.558087578194817, "grad_norm": 0.3214658200740814, "learning_rate": 4.639341965014879e-05, "loss": 0.8086, "num_input_tokens_seen": 9903432, "step": 17175 }, { "epoch": 2.558832290735776, "grad_norm": 0.3643120229244232, "learning_rate": 4.63900563519616e-05, "loss": 0.7904, "num_input_tokens_seen": 9906280, "step": 17180 }, { "epoch": 2.5595770032767353, "grad_norm": 0.31341537833213806, "learning_rate": 4.638669160832136e-05, "loss": 0.8029, "num_input_tokens_seen": 9909000, "step": 17185 }, { "epoch": 2.5603217158176945, "grad_norm": 0.3756321370601654, "learning_rate": 4.638332541945546e-05, "loss": 0.8192, "num_input_tokens_seen": 9911880, "step": 17190 }, { "epoch": 2.5610664283586537, "grad_norm": 0.26504671573638916, "learning_rate": 4.6379957785591355e-05, "loss": 0.773, "num_input_tokens_seen": 9914856, "step": 17195 }, { "epoch": 2.5618111408996125, "grad_norm": 0.3339313864707947, "learning_rate": 4.6376588706956635e-05, "loss": 0.8358, "num_input_tokens_seen": 9917480, "step": 17200 }, { "epoch": 2.562555853440572, "grad_norm": 0.2723558843135834, "learning_rate": 4.637321818377896e-05, "loss": 0.8232, "num_input_tokens_seen": 9920584, "step": 17205 }, { "epoch": 2.563300565981531, "grad_norm": 0.29959872364997864, "learning_rate": 4.636984621628609e-05, "loss": 0.7815, "num_input_tokens_seen": 9923560, "step": 17210 }, { "epoch": 2.5640452785224905, "grad_norm": 0.3431718349456787, "learning_rate": 4.6366472804705905e-05, "loss": 0.8235, "num_input_tokens_seen": 9926472, "step": 17215 }, { "epoch": 2.5647899910634493, "grad_norm": 0.2939017713069916, "learning_rate": 4.636309794926636e-05, "loss": 0.8103, "num_input_tokens_seen": 9929320, "step": 17220 }, { "epoch": 2.565534703604409, "grad_norm": 0.2965734302997589, "learning_rate": 4.635972165019551e-05, "loss": 0.7847, "num_input_tokens_seen": 9932040, "step": 17225 }, { "epoch": 2.5662794161453677, "grad_norm": 0.304515540599823, "learning_rate": 4.635634390772151e-05, "loss": 0.7833, "num_input_tokens_seen": 9934696, "step": 17230 }, { "epoch": 2.5670241286863273, "grad_norm": 0.3177647888660431, "learning_rate": 4.635296472207262e-05, "loss": 0.7846, "num_input_tokens_seen": 9937416, "step": 17235 }, { "epoch": 2.567768841227286, "grad_norm": 0.4392356872558594, "learning_rate": 4.6349584093477184e-05, "loss": 0.8095, "num_input_tokens_seen": 9940232, "step": 17240 }, { "epoch": 2.5685135537682453, "grad_norm": 0.31843578815460205, "learning_rate": 4.634620202216366e-05, "loss": 0.8235, "num_input_tokens_seen": 9942824, "step": 17245 }, { "epoch": 2.5692582663092045, "grad_norm": 0.2595561742782593, "learning_rate": 4.6342818508360595e-05, "loss": 0.7695, "num_input_tokens_seen": 9945736, "step": 17250 }, { "epoch": 2.5700029788501637, "grad_norm": 0.44901764392852783, "learning_rate": 4.633943355229662e-05, "loss": 0.7749, "num_input_tokens_seen": 9948616, "step": 17255 }, { "epoch": 2.570747691391123, "grad_norm": 0.30234211683273315, "learning_rate": 4.633604715420049e-05, "loss": 0.8057, "num_input_tokens_seen": 9951752, "step": 17260 }, { "epoch": 2.571492403932082, "grad_norm": 0.3373527228832245, "learning_rate": 4.6332659314301034e-05, "loss": 0.8026, "num_input_tokens_seen": 9954728, "step": 17265 }, { "epoch": 2.5722371164730413, "grad_norm": 0.3688420057296753, "learning_rate": 4.63292700328272e-05, "loss": 0.7852, "num_input_tokens_seen": 9957896, "step": 17270 }, { "epoch": 2.5729818290140005, "grad_norm": 0.42342767119407654, "learning_rate": 4.632587931000801e-05, "loss": 0.8536, "num_input_tokens_seen": 9961064, "step": 17275 }, { "epoch": 2.5737265415549597, "grad_norm": 0.27560049295425415, "learning_rate": 4.6322487146072614e-05, "loss": 0.7556, "num_input_tokens_seen": 9964104, "step": 17280 }, { "epoch": 2.574471254095919, "grad_norm": 0.26539862155914307, "learning_rate": 4.6319093541250214e-05, "loss": 0.8028, "num_input_tokens_seen": 9966984, "step": 17285 }, { "epoch": 2.575215966636878, "grad_norm": 0.39991939067840576, "learning_rate": 4.6315698495770155e-05, "loss": 0.8142, "num_input_tokens_seen": 9970056, "step": 17290 }, { "epoch": 2.5759606791778373, "grad_norm": 0.3698884844779968, "learning_rate": 4.6312302009861855e-05, "loss": 0.8263, "num_input_tokens_seen": 9973096, "step": 17295 }, { "epoch": 2.5767053917187965, "grad_norm": 0.5282882452011108, "learning_rate": 4.630890408375483e-05, "loss": 0.8376, "num_input_tokens_seen": 9976040, "step": 17300 }, { "epoch": 2.5774501042597557, "grad_norm": 0.44606417417526245, "learning_rate": 4.630550471767871e-05, "loss": 0.7838, "num_input_tokens_seen": 9979240, "step": 17305 }, { "epoch": 2.578194816800715, "grad_norm": 0.3288032114505768, "learning_rate": 4.6302103911863196e-05, "loss": 0.8048, "num_input_tokens_seen": 9982152, "step": 17310 }, { "epoch": 2.578939529341674, "grad_norm": 0.2459143102169037, "learning_rate": 4.6298701666538114e-05, "loss": 0.788, "num_input_tokens_seen": 9985000, "step": 17315 }, { "epoch": 2.5796842418826333, "grad_norm": 0.24795900285243988, "learning_rate": 4.629529798193336e-05, "loss": 0.7981, "num_input_tokens_seen": 9987880, "step": 17320 }, { "epoch": 2.5804289544235925, "grad_norm": 0.3588034510612488, "learning_rate": 4.629189285827895e-05, "loss": 0.8495, "num_input_tokens_seen": 9990888, "step": 17325 }, { "epoch": 2.5811736669645517, "grad_norm": 0.29052528738975525, "learning_rate": 4.6288486295805e-05, "loss": 0.7883, "num_input_tokens_seen": 9993544, "step": 17330 }, { "epoch": 2.581918379505511, "grad_norm": 0.2639889419078827, "learning_rate": 4.628507829474168e-05, "loss": 0.7752, "num_input_tokens_seen": 9996584, "step": 17335 }, { "epoch": 2.58266309204647, "grad_norm": 0.2672536075115204, "learning_rate": 4.628166885531932e-05, "loss": 0.792, "num_input_tokens_seen": 9999336, "step": 17340 }, { "epoch": 2.5834078045874294, "grad_norm": 0.3639872372150421, "learning_rate": 4.6278257977768305e-05, "loss": 0.8121, "num_input_tokens_seen": 10002184, "step": 17345 }, { "epoch": 2.5841525171283886, "grad_norm": 0.3538002073764801, "learning_rate": 4.627484566231912e-05, "loss": 0.8354, "num_input_tokens_seen": 10005096, "step": 17350 }, { "epoch": 2.5848972296693478, "grad_norm": 0.28189629316329956, "learning_rate": 4.627143190920237e-05, "loss": 0.76, "num_input_tokens_seen": 10008104, "step": 17355 }, { "epoch": 2.585641942210307, "grad_norm": 0.26321080327033997, "learning_rate": 4.626801671864872e-05, "loss": 0.8077, "num_input_tokens_seen": 10010792, "step": 17360 }, { "epoch": 2.586386654751266, "grad_norm": 0.29179322719573975, "learning_rate": 4.6264600090888984e-05, "loss": 0.8385, "num_input_tokens_seen": 10013768, "step": 17365 }, { "epoch": 2.5871313672922254, "grad_norm": 0.35730698704719543, "learning_rate": 4.626118202615403e-05, "loss": 0.8074, "num_input_tokens_seen": 10016488, "step": 17370 }, { "epoch": 2.587876079833184, "grad_norm": 0.27531909942626953, "learning_rate": 4.6257762524674826e-05, "loss": 0.8171, "num_input_tokens_seen": 10019432, "step": 17375 }, { "epoch": 2.5886207923741438, "grad_norm": 0.366830974817276, "learning_rate": 4.625434158668246e-05, "loss": 0.8234, "num_input_tokens_seen": 10022216, "step": 17380 }, { "epoch": 2.5893655049151025, "grad_norm": 0.4283175468444824, "learning_rate": 4.625091921240811e-05, "loss": 0.8245, "num_input_tokens_seen": 10025096, "step": 17385 }, { "epoch": 2.590110217456062, "grad_norm": 0.4858677089214325, "learning_rate": 4.624749540208304e-05, "loss": 0.8502, "num_input_tokens_seen": 10028136, "step": 17390 }, { "epoch": 2.590854929997021, "grad_norm": 0.2818479835987091, "learning_rate": 4.6244070155938614e-05, "loss": 0.7899, "num_input_tokens_seen": 10030888, "step": 17395 }, { "epoch": 2.5915996425379806, "grad_norm": 0.3178914487361908, "learning_rate": 4.624064347420629e-05, "loss": 0.8455, "num_input_tokens_seen": 10033576, "step": 17400 }, { "epoch": 2.5923443550789393, "grad_norm": 0.3818782866001129, "learning_rate": 4.623721535711765e-05, "loss": 0.799, "num_input_tokens_seen": 10036520, "step": 17405 }, { "epoch": 2.593089067619899, "grad_norm": 0.27271267771720886, "learning_rate": 4.623378580490434e-05, "loss": 0.7979, "num_input_tokens_seen": 10039368, "step": 17410 }, { "epoch": 2.5938337801608577, "grad_norm": 0.3539353013038635, "learning_rate": 4.6230354817798104e-05, "loss": 0.8023, "num_input_tokens_seen": 10042216, "step": 17415 }, { "epoch": 2.594578492701817, "grad_norm": 0.3291669487953186, "learning_rate": 4.622692239603082e-05, "loss": 0.8035, "num_input_tokens_seen": 10045352, "step": 17420 }, { "epoch": 2.595323205242776, "grad_norm": 0.3120880126953125, "learning_rate": 4.6223488539834415e-05, "loss": 0.8117, "num_input_tokens_seen": 10048296, "step": 17425 }, { "epoch": 2.5960679177837354, "grad_norm": 0.3605569005012512, "learning_rate": 4.622005324944095e-05, "loss": 0.8217, "num_input_tokens_seen": 10051560, "step": 17430 }, { "epoch": 2.5968126303246946, "grad_norm": 0.3764197528362274, "learning_rate": 4.621661652508255e-05, "loss": 0.8009, "num_input_tokens_seen": 10054504, "step": 17435 }, { "epoch": 2.5975573428656538, "grad_norm": 0.2805749773979187, "learning_rate": 4.621317836699147e-05, "loss": 0.8108, "num_input_tokens_seen": 10057800, "step": 17440 }, { "epoch": 2.598302055406613, "grad_norm": 0.3216180205345154, "learning_rate": 4.6209738775400045e-05, "loss": 0.7925, "num_input_tokens_seen": 10060616, "step": 17445 }, { "epoch": 2.599046767947572, "grad_norm": 0.29673001170158386, "learning_rate": 4.6206297750540706e-05, "loss": 0.7836, "num_input_tokens_seen": 10063464, "step": 17450 }, { "epoch": 2.5997914804885314, "grad_norm": 0.3571628928184509, "learning_rate": 4.620285529264599e-05, "loss": 0.7641, "num_input_tokens_seen": 10066184, "step": 17455 }, { "epoch": 2.6005361930294906, "grad_norm": 0.25883573293685913, "learning_rate": 4.619941140194851e-05, "loss": 0.8346, "num_input_tokens_seen": 10069096, "step": 17460 }, { "epoch": 2.6012809055704498, "grad_norm": 0.33935561776161194, "learning_rate": 4.6195966078680995e-05, "loss": 0.8191, "num_input_tokens_seen": 10071976, "step": 17465 }, { "epoch": 2.602025618111409, "grad_norm": 0.31467312574386597, "learning_rate": 4.619251932307627e-05, "loss": 0.8032, "num_input_tokens_seen": 10074536, "step": 17470 }, { "epoch": 2.602770330652368, "grad_norm": 0.21855132281780243, "learning_rate": 4.618907113536726e-05, "loss": 0.7794, "num_input_tokens_seen": 10077320, "step": 17475 }, { "epoch": 2.6035150431933274, "grad_norm": 0.3201485276222229, "learning_rate": 4.618562151578696e-05, "loss": 0.8229, "num_input_tokens_seen": 10080136, "step": 17480 }, { "epoch": 2.6042597557342866, "grad_norm": 0.2462879717350006, "learning_rate": 4.61821704645685e-05, "loss": 0.7803, "num_input_tokens_seen": 10083208, "step": 17485 }, { "epoch": 2.605004468275246, "grad_norm": 0.30482998490333557, "learning_rate": 4.6178717981945074e-05, "loss": 0.7837, "num_input_tokens_seen": 10086152, "step": 17490 }, { "epoch": 2.605749180816205, "grad_norm": 0.3076217472553253, "learning_rate": 4.617526406815e-05, "loss": 0.7919, "num_input_tokens_seen": 10089096, "step": 17495 }, { "epoch": 2.606493893357164, "grad_norm": 0.3626778721809387, "learning_rate": 4.617180872341667e-05, "loss": 0.8304, "num_input_tokens_seen": 10091912, "step": 17500 }, { "epoch": 2.6072386058981234, "grad_norm": 0.27000343799591064, "learning_rate": 4.616835194797858e-05, "loss": 0.7807, "num_input_tokens_seen": 10094856, "step": 17505 }, { "epoch": 2.6079833184390826, "grad_norm": 0.38287460803985596, "learning_rate": 4.616489374206934e-05, "loss": 0.8247, "num_input_tokens_seen": 10097672, "step": 17510 }, { "epoch": 2.608728030980042, "grad_norm": 0.2527558207511902, "learning_rate": 4.6161434105922616e-05, "loss": 0.82, "num_input_tokens_seen": 10100328, "step": 17515 }, { "epoch": 2.609472743521001, "grad_norm": 0.4045206308364868, "learning_rate": 4.615797303977223e-05, "loss": 0.8056, "num_input_tokens_seen": 10103016, "step": 17520 }, { "epoch": 2.61021745606196, "grad_norm": 0.2636438012123108, "learning_rate": 4.615451054385204e-05, "loss": 0.7792, "num_input_tokens_seen": 10105960, "step": 17525 }, { "epoch": 2.6109621686029194, "grad_norm": 0.32544440031051636, "learning_rate": 4.615104661839603e-05, "loss": 0.7479, "num_input_tokens_seen": 10108712, "step": 17530 }, { "epoch": 2.6117068811438786, "grad_norm": 0.3391639292240143, "learning_rate": 4.6147581263638286e-05, "loss": 0.8159, "num_input_tokens_seen": 10111528, "step": 17535 }, { "epoch": 2.612451593684838, "grad_norm": 0.2828783094882965, "learning_rate": 4.614411447981298e-05, "loss": 0.7987, "num_input_tokens_seen": 10114312, "step": 17540 }, { "epoch": 2.613196306225797, "grad_norm": 0.406059592962265, "learning_rate": 4.6140646267154384e-05, "loss": 0.7876, "num_input_tokens_seen": 10117224, "step": 17545 }, { "epoch": 2.6139410187667558, "grad_norm": 0.23930130898952484, "learning_rate": 4.613717662589687e-05, "loss": 0.7727, "num_input_tokens_seen": 10119912, "step": 17550 }, { "epoch": 2.6146857313077154, "grad_norm": 0.3085716962814331, "learning_rate": 4.613370555627489e-05, "loss": 0.7978, "num_input_tokens_seen": 10123016, "step": 17555 }, { "epoch": 2.615430443848674, "grad_norm": 0.27223849296569824, "learning_rate": 4.6130233058523015e-05, "loss": 0.8343, "num_input_tokens_seen": 10125800, "step": 17560 }, { "epoch": 2.616175156389634, "grad_norm": 0.37040188908576965, "learning_rate": 4.6126759132875896e-05, "loss": 0.8773, "num_input_tokens_seen": 10128392, "step": 17565 }, { "epoch": 2.6169198689305926, "grad_norm": 0.31641173362731934, "learning_rate": 4.612328377956829e-05, "loss": 0.7949, "num_input_tokens_seen": 10130856, "step": 17570 }, { "epoch": 2.6176645814715522, "grad_norm": 0.3027770519256592, "learning_rate": 4.6119806998835056e-05, "loss": 0.7955, "num_input_tokens_seen": 10133512, "step": 17575 }, { "epoch": 2.618409294012511, "grad_norm": 0.31695258617401123, "learning_rate": 4.611632879091112e-05, "loss": 0.7811, "num_input_tokens_seen": 10136392, "step": 17580 }, { "epoch": 2.6191540065534706, "grad_norm": 0.4654964506626129, "learning_rate": 4.6112849156031544e-05, "loss": 0.8652, "num_input_tokens_seen": 10138984, "step": 17585 }, { "epoch": 2.6198987190944294, "grad_norm": 0.34988710284233093, "learning_rate": 4.610936809443146e-05, "loss": 0.8172, "num_input_tokens_seen": 10141800, "step": 17590 }, { "epoch": 2.6206434316353886, "grad_norm": 0.2827449440956116, "learning_rate": 4.610588560634611e-05, "loss": 0.8004, "num_input_tokens_seen": 10144360, "step": 17595 }, { "epoch": 2.621388144176348, "grad_norm": 0.3074379861354828, "learning_rate": 4.610240169201081e-05, "loss": 0.8257, "num_input_tokens_seen": 10147080, "step": 17600 }, { "epoch": 2.622132856717307, "grad_norm": 0.33373335003852844, "learning_rate": 4.6098916351661006e-05, "loss": 0.8132, "num_input_tokens_seen": 10150120, "step": 17605 }, { "epoch": 2.622877569258266, "grad_norm": 0.25577312707901, "learning_rate": 4.609542958553221e-05, "loss": 0.8225, "num_input_tokens_seen": 10152744, "step": 17610 }, { "epoch": 2.6236222817992254, "grad_norm": 0.2104119211435318, "learning_rate": 4.609194139386006e-05, "loss": 0.8183, "num_input_tokens_seen": 10155464, "step": 17615 }, { "epoch": 2.6243669943401846, "grad_norm": 0.3247397840023041, "learning_rate": 4.608845177688026e-05, "loss": 0.7972, "num_input_tokens_seen": 10158248, "step": 17620 }, { "epoch": 2.625111706881144, "grad_norm": 0.3206140697002411, "learning_rate": 4.608496073482863e-05, "loss": 0.8477, "num_input_tokens_seen": 10161000, "step": 17625 }, { "epoch": 2.625856419422103, "grad_norm": 0.26394540071487427, "learning_rate": 4.608146826794107e-05, "loss": 0.806, "num_input_tokens_seen": 10163912, "step": 17630 }, { "epoch": 2.626601131963062, "grad_norm": 0.29410937428474426, "learning_rate": 4.607797437645361e-05, "loss": 0.8327, "num_input_tokens_seen": 10166632, "step": 17635 }, { "epoch": 2.6273458445040214, "grad_norm": 0.31432995200157166, "learning_rate": 4.607447906060233e-05, "loss": 0.7599, "num_input_tokens_seen": 10170024, "step": 17640 }, { "epoch": 2.6280905570449806, "grad_norm": 0.2611856460571289, "learning_rate": 4.607098232062344e-05, "loss": 0.8158, "num_input_tokens_seen": 10172808, "step": 17645 }, { "epoch": 2.62883526958594, "grad_norm": 0.32243168354034424, "learning_rate": 4.6067484156753234e-05, "loss": 0.8063, "num_input_tokens_seen": 10176072, "step": 17650 }, { "epoch": 2.629579982126899, "grad_norm": 0.2425755262374878, "learning_rate": 4.6063984569228103e-05, "loss": 0.8128, "num_input_tokens_seen": 10178920, "step": 17655 }, { "epoch": 2.6303246946678582, "grad_norm": 0.44132304191589355, "learning_rate": 4.606048355828453e-05, "loss": 0.8333, "num_input_tokens_seen": 10182408, "step": 17660 }, { "epoch": 2.6310694072088174, "grad_norm": 0.2724655568599701, "learning_rate": 4.6056981124159104e-05, "loss": 0.8357, "num_input_tokens_seen": 10185256, "step": 17665 }, { "epoch": 2.6318141197497766, "grad_norm": 0.2835652232170105, "learning_rate": 4.605347726708851e-05, "loss": 0.8064, "num_input_tokens_seen": 10188232, "step": 17670 }, { "epoch": 2.632558832290736, "grad_norm": 0.3309086263179779, "learning_rate": 4.604997198730951e-05, "loss": 0.7986, "num_input_tokens_seen": 10191272, "step": 17675 }, { "epoch": 2.633303544831695, "grad_norm": 0.2450607866048813, "learning_rate": 4.6046465285058996e-05, "loss": 0.7967, "num_input_tokens_seen": 10194088, "step": 17680 }, { "epoch": 2.6340482573726542, "grad_norm": 0.4490298330783844, "learning_rate": 4.604295716057393e-05, "loss": 0.817, "num_input_tokens_seen": 10196840, "step": 17685 }, { "epoch": 2.6347929699136134, "grad_norm": 0.34449443221092224, "learning_rate": 4.6039447614091365e-05, "loss": 0.8002, "num_input_tokens_seen": 10199720, "step": 17690 }, { "epoch": 2.6355376824545726, "grad_norm": 0.3336271047592163, "learning_rate": 4.6035936645848476e-05, "loss": 0.7992, "num_input_tokens_seen": 10202760, "step": 17695 }, { "epoch": 2.636282394995532, "grad_norm": 0.3044072389602661, "learning_rate": 4.6032424256082504e-05, "loss": 0.7936, "num_input_tokens_seen": 10205896, "step": 17700 }, { "epoch": 2.637027107536491, "grad_norm": 0.3557107448577881, "learning_rate": 4.602891044503083e-05, "loss": 0.8344, "num_input_tokens_seen": 10208808, "step": 17705 }, { "epoch": 2.6377718200774503, "grad_norm": 0.3510020673274994, "learning_rate": 4.6025395212930864e-05, "loss": 0.8254, "num_input_tokens_seen": 10211944, "step": 17710 }, { "epoch": 2.6385165326184095, "grad_norm": 0.3440496623516083, "learning_rate": 4.602187856002019e-05, "loss": 0.8162, "num_input_tokens_seen": 10214728, "step": 17715 }, { "epoch": 2.6392612451593687, "grad_norm": 0.34804272651672363, "learning_rate": 4.601836048653642e-05, "loss": 0.8522, "num_input_tokens_seen": 10217640, "step": 17720 }, { "epoch": 2.6400059577003274, "grad_norm": 0.27000072598457336, "learning_rate": 4.601484099271731e-05, "loss": 0.8152, "num_input_tokens_seen": 10220456, "step": 17725 }, { "epoch": 2.640750670241287, "grad_norm": 0.4409390091896057, "learning_rate": 4.601132007880068e-05, "loss": 0.8266, "num_input_tokens_seen": 10223656, "step": 17730 }, { "epoch": 2.641495382782246, "grad_norm": 0.24486853182315826, "learning_rate": 4.600779774502447e-05, "loss": 0.8125, "num_input_tokens_seen": 10226600, "step": 17735 }, { "epoch": 2.6422400953232055, "grad_norm": 0.24297502636909485, "learning_rate": 4.60042739916267e-05, "loss": 0.7904, "num_input_tokens_seen": 10229768, "step": 17740 }, { "epoch": 2.6429848078641642, "grad_norm": 0.2784886062145233, "learning_rate": 4.600074881884549e-05, "loss": 0.8158, "num_input_tokens_seen": 10232648, "step": 17745 }, { "epoch": 2.643729520405124, "grad_norm": 0.3186268210411072, "learning_rate": 4.599722222691906e-05, "loss": 0.8102, "num_input_tokens_seen": 10235400, "step": 17750 }, { "epoch": 2.6444742329460826, "grad_norm": 0.2779868245124817, "learning_rate": 4.599369421608571e-05, "loss": 0.7736, "num_input_tokens_seen": 10238312, "step": 17755 }, { "epoch": 2.645218945487042, "grad_norm": 0.31609848141670227, "learning_rate": 4.5990164786583865e-05, "loss": 0.8039, "num_input_tokens_seen": 10241160, "step": 17760 }, { "epoch": 2.645963658028001, "grad_norm": 0.31080859899520874, "learning_rate": 4.598663393865203e-05, "loss": 0.8339, "num_input_tokens_seen": 10244040, "step": 17765 }, { "epoch": 2.6467083705689602, "grad_norm": 0.30430611968040466, "learning_rate": 4.598310167252879e-05, "loss": 0.7999, "num_input_tokens_seen": 10246792, "step": 17770 }, { "epoch": 2.6474530831099194, "grad_norm": 0.2787005305290222, "learning_rate": 4.5979567988452856e-05, "loss": 0.8036, "num_input_tokens_seen": 10249864, "step": 17775 }, { "epoch": 2.6481977956508786, "grad_norm": 0.3302619457244873, "learning_rate": 4.597603288666301e-05, "loss": 0.812, "num_input_tokens_seen": 10252936, "step": 17780 }, { "epoch": 2.648942508191838, "grad_norm": 0.37153416872024536, "learning_rate": 4.597249636739815e-05, "loss": 0.8063, "num_input_tokens_seen": 10255720, "step": 17785 }, { "epoch": 2.649687220732797, "grad_norm": 0.28447720408439636, "learning_rate": 4.5968958430897246e-05, "loss": 0.8019, "num_input_tokens_seen": 10258952, "step": 17790 }, { "epoch": 2.6504319332737563, "grad_norm": 0.3240763247013092, "learning_rate": 4.596541907739939e-05, "loss": 0.857, "num_input_tokens_seen": 10261864, "step": 17795 }, { "epoch": 2.6511766458147155, "grad_norm": 0.3420640826225281, "learning_rate": 4.5961878307143746e-05, "loss": 0.8057, "num_input_tokens_seen": 10264808, "step": 17800 }, { "epoch": 2.6519213583556747, "grad_norm": 0.3915516436100006, "learning_rate": 4.595833612036959e-05, "loss": 0.8065, "num_input_tokens_seen": 10267912, "step": 17805 }, { "epoch": 2.652666070896634, "grad_norm": 0.32283374667167664, "learning_rate": 4.59547925173163e-05, "loss": 0.7976, "num_input_tokens_seen": 10270888, "step": 17810 }, { "epoch": 2.653410783437593, "grad_norm": 0.4492281973361969, "learning_rate": 4.595124749822332e-05, "loss": 0.7886, "num_input_tokens_seen": 10273736, "step": 17815 }, { "epoch": 2.6541554959785523, "grad_norm": 0.36807867884635925, "learning_rate": 4.594770106333022e-05, "loss": 0.8167, "num_input_tokens_seen": 10276648, "step": 17820 }, { "epoch": 2.6549002085195115, "grad_norm": 0.3187723457813263, "learning_rate": 4.594415321287664e-05, "loss": 0.8085, "num_input_tokens_seen": 10279912, "step": 17825 }, { "epoch": 2.6556449210604707, "grad_norm": 0.33781200647354126, "learning_rate": 4.594060394710235e-05, "loss": 0.8366, "num_input_tokens_seen": 10282856, "step": 17830 }, { "epoch": 2.65638963360143, "grad_norm": 0.3017113506793976, "learning_rate": 4.593705326624718e-05, "loss": 0.7908, "num_input_tokens_seen": 10285640, "step": 17835 }, { "epoch": 2.657134346142389, "grad_norm": 0.323471337556839, "learning_rate": 4.593350117055107e-05, "loss": 0.8163, "num_input_tokens_seen": 10288552, "step": 17840 }, { "epoch": 2.6578790586833483, "grad_norm": 0.32238930463790894, "learning_rate": 4.592994766025407e-05, "loss": 0.8032, "num_input_tokens_seen": 10291272, "step": 17845 }, { "epoch": 2.6586237712243075, "grad_norm": 0.3340044319629669, "learning_rate": 4.592639273559629e-05, "loss": 0.7815, "num_input_tokens_seen": 10294312, "step": 17850 }, { "epoch": 2.6593684837652667, "grad_norm": 0.394697368144989, "learning_rate": 4.5922836396817973e-05, "loss": 0.8304, "num_input_tokens_seen": 10297576, "step": 17855 }, { "epoch": 2.660113196306226, "grad_norm": 0.2636543810367584, "learning_rate": 4.591927864415944e-05, "loss": 0.8009, "num_input_tokens_seen": 10300104, "step": 17860 }, { "epoch": 2.660857908847185, "grad_norm": 0.20926211774349213, "learning_rate": 4.591571947786111e-05, "loss": 0.7878, "num_input_tokens_seen": 10303080, "step": 17865 }, { "epoch": 2.6616026213881443, "grad_norm": 0.24612954258918762, "learning_rate": 4.591215889816349e-05, "loss": 0.8041, "num_input_tokens_seen": 10305832, "step": 17870 }, { "epoch": 2.6623473339291035, "grad_norm": 0.3081203103065491, "learning_rate": 4.59085969053072e-05, "loss": 0.8163, "num_input_tokens_seen": 10308360, "step": 17875 }, { "epoch": 2.6630920464700627, "grad_norm": 0.3791804909706116, "learning_rate": 4.5905033499532936e-05, "loss": 0.817, "num_input_tokens_seen": 10311304, "step": 17880 }, { "epoch": 2.663836759011022, "grad_norm": 0.3563494384288788, "learning_rate": 4.590146868108151e-05, "loss": 0.812, "num_input_tokens_seen": 10313928, "step": 17885 }, { "epoch": 2.6645814715519807, "grad_norm": 0.30680051445961, "learning_rate": 4.589790245019379e-05, "loss": 0.7891, "num_input_tokens_seen": 10317064, "step": 17890 }, { "epoch": 2.6653261840929403, "grad_norm": 0.2586878538131714, "learning_rate": 4.5894334807110806e-05, "loss": 0.8207, "num_input_tokens_seen": 10319816, "step": 17895 }, { "epoch": 2.666070896633899, "grad_norm": 0.25897711515426636, "learning_rate": 4.589076575207362e-05, "loss": 0.8465, "num_input_tokens_seen": 10322472, "step": 17900 }, { "epoch": 2.6668156091748587, "grad_norm": 0.3083900809288025, "learning_rate": 4.588719528532342e-05, "loss": 0.8036, "num_input_tokens_seen": 10325352, "step": 17905 }, { "epoch": 2.6675603217158175, "grad_norm": 0.3351230025291443, "learning_rate": 4.5883623407101475e-05, "loss": 0.7853, "num_input_tokens_seen": 10328584, "step": 17910 }, { "epoch": 2.668305034256777, "grad_norm": 0.2798704504966736, "learning_rate": 4.5880050117649174e-05, "loss": 0.7881, "num_input_tokens_seen": 10331304, "step": 17915 }, { "epoch": 2.669049746797736, "grad_norm": 0.282979816198349, "learning_rate": 4.5876475417207974e-05, "loss": 0.7497, "num_input_tokens_seen": 10334248, "step": 17920 }, { "epoch": 2.6697944593386955, "grad_norm": 0.26251474022865295, "learning_rate": 4.5872899306019454e-05, "loss": 0.7897, "num_input_tokens_seen": 10336968, "step": 17925 }, { "epoch": 2.6705391718796543, "grad_norm": 0.31411314010620117, "learning_rate": 4.586932178432525e-05, "loss": 0.87, "num_input_tokens_seen": 10339560, "step": 17930 }, { "epoch": 2.6712838844206135, "grad_norm": 0.27141499519348145, "learning_rate": 4.586574285236714e-05, "loss": 0.8508, "num_input_tokens_seen": 10342504, "step": 17935 }, { "epoch": 2.6720285969615727, "grad_norm": 0.2902992069721222, "learning_rate": 4.586216251038695e-05, "loss": 0.8229, "num_input_tokens_seen": 10345320, "step": 17940 }, { "epoch": 2.672773309502532, "grad_norm": 0.28086400032043457, "learning_rate": 4.585858075862665e-05, "loss": 0.7932, "num_input_tokens_seen": 10348200, "step": 17945 }, { "epoch": 2.673518022043491, "grad_norm": 0.40533941984176636, "learning_rate": 4.585499759732825e-05, "loss": 0.7967, "num_input_tokens_seen": 10351304, "step": 17950 }, { "epoch": 2.6742627345844503, "grad_norm": 0.23132066428661346, "learning_rate": 4.585141302673392e-05, "loss": 0.8013, "num_input_tokens_seen": 10354024, "step": 17955 }, { "epoch": 2.6750074471254095, "grad_norm": 0.28353366255760193, "learning_rate": 4.584782704708587e-05, "loss": 0.7885, "num_input_tokens_seen": 10357448, "step": 17960 }, { "epoch": 2.6757521596663687, "grad_norm": 0.3744037449359894, "learning_rate": 4.584423965862642e-05, "loss": 0.8361, "num_input_tokens_seen": 10360424, "step": 17965 }, { "epoch": 2.676496872207328, "grad_norm": 0.37257686257362366, "learning_rate": 4.5840650861598e-05, "loss": 0.8153, "num_input_tokens_seen": 10363464, "step": 17970 }, { "epoch": 2.677241584748287, "grad_norm": 0.238259956240654, "learning_rate": 4.583706065624314e-05, "loss": 0.8247, "num_input_tokens_seen": 10366536, "step": 17975 }, { "epoch": 2.6779862972892463, "grad_norm": 0.31120380759239197, "learning_rate": 4.583346904280442e-05, "loss": 0.8255, "num_input_tokens_seen": 10369416, "step": 17980 }, { "epoch": 2.6787310098302055, "grad_norm": 0.2948579490184784, "learning_rate": 4.582987602152458e-05, "loss": 0.8202, "num_input_tokens_seen": 10372360, "step": 17985 }, { "epoch": 2.6794757223711647, "grad_norm": 0.33614909648895264, "learning_rate": 4.58262815926464e-05, "loss": 0.789, "num_input_tokens_seen": 10375112, "step": 17990 }, { "epoch": 2.680220434912124, "grad_norm": 0.2607766091823578, "learning_rate": 4.5822685756412785e-05, "loss": 0.8005, "num_input_tokens_seen": 10378088, "step": 17995 }, { "epoch": 2.680965147453083, "grad_norm": 0.39594221115112305, "learning_rate": 4.5819088513066725e-05, "loss": 0.8232, "num_input_tokens_seen": 10380904, "step": 18000 }, { "epoch": 2.6817098599940423, "grad_norm": 0.3642502427101135, "learning_rate": 4.581548986285131e-05, "loss": 0.8449, "num_input_tokens_seen": 10383624, "step": 18005 }, { "epoch": 2.6824545725350015, "grad_norm": 0.2703346312046051, "learning_rate": 4.5811889806009716e-05, "loss": 0.8113, "num_input_tokens_seen": 10386536, "step": 18010 }, { "epoch": 2.6831992850759607, "grad_norm": 0.3621342182159424, "learning_rate": 4.580828834278523e-05, "loss": 0.8165, "num_input_tokens_seen": 10389448, "step": 18015 }, { "epoch": 2.68394399761692, "grad_norm": 0.365070641040802, "learning_rate": 4.580468547342121e-05, "loss": 0.815, "num_input_tokens_seen": 10392104, "step": 18020 }, { "epoch": 2.684688710157879, "grad_norm": 0.3044753074645996, "learning_rate": 4.5801081198161134e-05, "loss": 0.8135, "num_input_tokens_seen": 10394920, "step": 18025 }, { "epoch": 2.6854334226988383, "grad_norm": 0.29169929027557373, "learning_rate": 4.5797475517248565e-05, "loss": 0.7924, "num_input_tokens_seen": 10397448, "step": 18030 }, { "epoch": 2.6861781352397975, "grad_norm": 0.32275959849357605, "learning_rate": 4.579386843092715e-05, "loss": 0.8171, "num_input_tokens_seen": 10399880, "step": 18035 }, { "epoch": 2.6869228477807567, "grad_norm": 0.3432374596595764, "learning_rate": 4.579025993944065e-05, "loss": 0.7853, "num_input_tokens_seen": 10402568, "step": 18040 }, { "epoch": 2.687667560321716, "grad_norm": 0.26130348443984985, "learning_rate": 4.578665004303292e-05, "loss": 0.8342, "num_input_tokens_seen": 10405384, "step": 18045 }, { "epoch": 2.688412272862675, "grad_norm": 0.350386381149292, "learning_rate": 4.578303874194789e-05, "loss": 0.7979, "num_input_tokens_seen": 10408264, "step": 18050 }, { "epoch": 2.6891569854036343, "grad_norm": 0.28370431065559387, "learning_rate": 4.577942603642959e-05, "loss": 0.8292, "num_input_tokens_seen": 10411112, "step": 18055 }, { "epoch": 2.6899016979445936, "grad_norm": 0.3724963665008545, "learning_rate": 4.5775811926722166e-05, "loss": 0.7885, "num_input_tokens_seen": 10413800, "step": 18060 }, { "epoch": 2.6906464104855523, "grad_norm": 0.2816362977027893, "learning_rate": 4.577219641306984e-05, "loss": 0.8153, "num_input_tokens_seen": 10416456, "step": 18065 }, { "epoch": 2.691391123026512, "grad_norm": 0.2447807788848877, "learning_rate": 4.5768579495716935e-05, "loss": 0.8346, "num_input_tokens_seen": 10419432, "step": 18070 }, { "epoch": 2.6921358355674707, "grad_norm": 0.25904446840286255, "learning_rate": 4.5764961174907865e-05, "loss": 0.802, "num_input_tokens_seen": 10422568, "step": 18075 }, { "epoch": 2.6928805481084304, "grad_norm": 0.26530665159225464, "learning_rate": 4.576134145088715e-05, "loss": 0.7983, "num_input_tokens_seen": 10425192, "step": 18080 }, { "epoch": 2.693625260649389, "grad_norm": 0.30117276310920715, "learning_rate": 4.575772032389938e-05, "loss": 0.822, "num_input_tokens_seen": 10428040, "step": 18085 }, { "epoch": 2.6943699731903488, "grad_norm": 0.24072177708148956, "learning_rate": 4.575409779418927e-05, "loss": 0.7867, "num_input_tokens_seen": 10430824, "step": 18090 }, { "epoch": 2.6951146857313075, "grad_norm": 0.34847885370254517, "learning_rate": 4.5750473862001606e-05, "loss": 0.8051, "num_input_tokens_seen": 10433608, "step": 18095 }, { "epoch": 2.695859398272267, "grad_norm": 0.2599734365940094, "learning_rate": 4.5746848527581287e-05, "loss": 0.8113, "num_input_tokens_seen": 10436392, "step": 18100 }, { "epoch": 2.696604110813226, "grad_norm": 0.2712426781654358, "learning_rate": 4.5743221791173296e-05, "loss": 0.7663, "num_input_tokens_seen": 10439080, "step": 18105 }, { "epoch": 2.697348823354185, "grad_norm": 0.3861156404018402, "learning_rate": 4.573959365302272e-05, "loss": 0.8491, "num_input_tokens_seen": 10442024, "step": 18110 }, { "epoch": 2.6980935358951443, "grad_norm": 0.2786107063293457, "learning_rate": 4.5735964113374715e-05, "loss": 0.8355, "num_input_tokens_seen": 10444744, "step": 18115 }, { "epoch": 2.6988382484361035, "grad_norm": 0.33306896686553955, "learning_rate": 4.573233317247456e-05, "loss": 0.7843, "num_input_tokens_seen": 10447624, "step": 18120 }, { "epoch": 2.6995829609770627, "grad_norm": 0.28226709365844727, "learning_rate": 4.572870083056763e-05, "loss": 0.8255, "num_input_tokens_seen": 10450536, "step": 18125 }, { "epoch": 2.700327673518022, "grad_norm": 0.2654961049556732, "learning_rate": 4.5725067087899364e-05, "loss": 0.7907, "num_input_tokens_seen": 10453352, "step": 18130 }, { "epoch": 2.701072386058981, "grad_norm": 0.38273146748542786, "learning_rate": 4.572143194471533e-05, "loss": 0.8126, "num_input_tokens_seen": 10456648, "step": 18135 }, { "epoch": 2.7018170985999403, "grad_norm": 0.3146129548549652, "learning_rate": 4.5717795401261175e-05, "loss": 0.8034, "num_input_tokens_seen": 10459656, "step": 18140 }, { "epoch": 2.7025618111408996, "grad_norm": 0.3509639799594879, "learning_rate": 4.571415745778264e-05, "loss": 0.8409, "num_input_tokens_seen": 10462696, "step": 18145 }, { "epoch": 2.7033065236818588, "grad_norm": 0.31335967779159546, "learning_rate": 4.571051811452556e-05, "loss": 0.7776, "num_input_tokens_seen": 10465736, "step": 18150 }, { "epoch": 2.704051236222818, "grad_norm": 0.45052266120910645, "learning_rate": 4.570687737173587e-05, "loss": 0.8216, "num_input_tokens_seen": 10468552, "step": 18155 }, { "epoch": 2.704795948763777, "grad_norm": 0.3125212788581848, "learning_rate": 4.570323522965959e-05, "loss": 0.8242, "num_input_tokens_seen": 10471432, "step": 18160 }, { "epoch": 2.7055406613047364, "grad_norm": 0.3821723163127899, "learning_rate": 4.569959168854285e-05, "loss": 0.7961, "num_input_tokens_seen": 10473864, "step": 18165 }, { "epoch": 2.7062853738456956, "grad_norm": 0.290009081363678, "learning_rate": 4.5695946748631866e-05, "loss": 0.797, "num_input_tokens_seen": 10476456, "step": 18170 }, { "epoch": 2.7070300863866548, "grad_norm": 0.2065746784210205, "learning_rate": 4.5692300410172936e-05, "loss": 0.783, "num_input_tokens_seen": 10479336, "step": 18175 }, { "epoch": 2.707774798927614, "grad_norm": 0.302727073431015, "learning_rate": 4.568865267341248e-05, "loss": 0.8086, "num_input_tokens_seen": 10481960, "step": 18180 }, { "epoch": 2.708519511468573, "grad_norm": 0.2652549147605896, "learning_rate": 4.5685003538596994e-05, "loss": 0.8131, "num_input_tokens_seen": 10484744, "step": 18185 }, { "epoch": 2.7092642240095324, "grad_norm": 0.2916005551815033, "learning_rate": 4.568135300597306e-05, "loss": 0.8162, "num_input_tokens_seen": 10487144, "step": 18190 }, { "epoch": 2.7100089365504916, "grad_norm": 0.3073422312736511, "learning_rate": 4.567770107578737e-05, "loss": 0.8022, "num_input_tokens_seen": 10490120, "step": 18195 }, { "epoch": 2.710753649091451, "grad_norm": 0.33806371688842773, "learning_rate": 4.567404774828672e-05, "loss": 0.8358, "num_input_tokens_seen": 10493192, "step": 18200 }, { "epoch": 2.71149836163241, "grad_norm": 0.2385057657957077, "learning_rate": 4.567039302371797e-05, "loss": 0.8015, "num_input_tokens_seen": 10496264, "step": 18205 }, { "epoch": 2.712243074173369, "grad_norm": 0.3310653269290924, "learning_rate": 4.566673690232811e-05, "loss": 0.8099, "num_input_tokens_seen": 10499112, "step": 18210 }, { "epoch": 2.7129877867143284, "grad_norm": 0.3746158182621002, "learning_rate": 4.566307938436419e-05, "loss": 0.7691, "num_input_tokens_seen": 10501896, "step": 18215 }, { "epoch": 2.7137324992552876, "grad_norm": 0.3932769000530243, "learning_rate": 4.565942047007337e-05, "loss": 0.8171, "num_input_tokens_seen": 10504968, "step": 18220 }, { "epoch": 2.714477211796247, "grad_norm": 0.2914234399795532, "learning_rate": 4.5655760159702914e-05, "loss": 0.82, "num_input_tokens_seen": 10508136, "step": 18225 }, { "epoch": 2.715221924337206, "grad_norm": 0.3190940320491791, "learning_rate": 4.565209845350017e-05, "loss": 0.8264, "num_input_tokens_seen": 10511176, "step": 18230 }, { "epoch": 2.715966636878165, "grad_norm": 0.35905104875564575, "learning_rate": 4.564843535171257e-05, "loss": 0.8034, "num_input_tokens_seen": 10513896, "step": 18235 }, { "epoch": 2.716711349419124, "grad_norm": 0.28951507806777954, "learning_rate": 4.5644770854587666e-05, "loss": 0.7717, "num_input_tokens_seen": 10516808, "step": 18240 }, { "epoch": 2.7174560619600836, "grad_norm": 0.278279572725296, "learning_rate": 4.564110496237308e-05, "loss": 0.8103, "num_input_tokens_seen": 10519464, "step": 18245 }, { "epoch": 2.7182007745010424, "grad_norm": 0.3118496835231781, "learning_rate": 4.563743767531654e-05, "loss": 0.818, "num_input_tokens_seen": 10522152, "step": 18250 }, { "epoch": 2.718945487042002, "grad_norm": 0.34991899132728577, "learning_rate": 4.563376899366587e-05, "loss": 0.7977, "num_input_tokens_seen": 10525064, "step": 18255 }, { "epoch": 2.7196901995829608, "grad_norm": 0.30105552077293396, "learning_rate": 4.5630098917668986e-05, "loss": 0.8019, "num_input_tokens_seen": 10527752, "step": 18260 }, { "epoch": 2.7204349121239204, "grad_norm": 0.27827176451683044, "learning_rate": 4.5626427447573884e-05, "loss": 0.7988, "num_input_tokens_seen": 10530568, "step": 18265 }, { "epoch": 2.721179624664879, "grad_norm": 0.29597383737564087, "learning_rate": 4.562275458362868e-05, "loss": 0.8253, "num_input_tokens_seen": 10533448, "step": 18270 }, { "epoch": 2.721924337205839, "grad_norm": 0.24581260979175568, "learning_rate": 4.561908032608157e-05, "loss": 0.8057, "num_input_tokens_seen": 10536264, "step": 18275 }, { "epoch": 2.7226690497467976, "grad_norm": 0.3770845830440521, "learning_rate": 4.561540467518084e-05, "loss": 0.8014, "num_input_tokens_seen": 10539688, "step": 18280 }, { "epoch": 2.723413762287757, "grad_norm": 0.4673616290092468, "learning_rate": 4.561172763117488e-05, "loss": 0.8147, "num_input_tokens_seen": 10542568, "step": 18285 }, { "epoch": 2.724158474828716, "grad_norm": 0.3041534125804901, "learning_rate": 4.5608049194312165e-05, "loss": 0.8419, "num_input_tokens_seen": 10545416, "step": 18290 }, { "epoch": 2.724903187369675, "grad_norm": 0.34660378098487854, "learning_rate": 4.560436936484127e-05, "loss": 0.7961, "num_input_tokens_seen": 10548488, "step": 18295 }, { "epoch": 2.7256478999106344, "grad_norm": 0.36913394927978516, "learning_rate": 4.5600688143010874e-05, "loss": 0.7893, "num_input_tokens_seen": 10551144, "step": 18300 }, { "epoch": 2.7263926124515936, "grad_norm": 0.37172576785087585, "learning_rate": 4.559700552906972e-05, "loss": 0.8375, "num_input_tokens_seen": 10554152, "step": 18305 }, { "epoch": 2.727137324992553, "grad_norm": 0.1984376311302185, "learning_rate": 4.559332152326667e-05, "loss": 0.8236, "num_input_tokens_seen": 10556936, "step": 18310 }, { "epoch": 2.727882037533512, "grad_norm": 0.22658592462539673, "learning_rate": 4.558963612585069e-05, "loss": 0.7985, "num_input_tokens_seen": 10559688, "step": 18315 }, { "epoch": 2.728626750074471, "grad_norm": 0.30319729447364807, "learning_rate": 4.558594933707081e-05, "loss": 0.8159, "num_input_tokens_seen": 10562952, "step": 18320 }, { "epoch": 2.7293714626154304, "grad_norm": 0.2992922067642212, "learning_rate": 4.5582261157176164e-05, "loss": 0.8036, "num_input_tokens_seen": 10565672, "step": 18325 }, { "epoch": 2.7301161751563896, "grad_norm": 0.2964217960834503, "learning_rate": 4.5578571586416e-05, "loss": 0.7853, "num_input_tokens_seen": 10568712, "step": 18330 }, { "epoch": 2.730860887697349, "grad_norm": 0.28040310740470886, "learning_rate": 4.557488062503962e-05, "loss": 0.832, "num_input_tokens_seen": 10571368, "step": 18335 }, { "epoch": 2.731605600238308, "grad_norm": 0.32391634583473206, "learning_rate": 4.557118827329647e-05, "loss": 0.7872, "num_input_tokens_seen": 10574088, "step": 18340 }, { "epoch": 2.732350312779267, "grad_norm": 0.2647096514701843, "learning_rate": 4.556749453143605e-05, "loss": 0.8266, "num_input_tokens_seen": 10576872, "step": 18345 }, { "epoch": 2.7330950253202264, "grad_norm": 0.28150564432144165, "learning_rate": 4.556379939970797e-05, "loss": 0.8012, "num_input_tokens_seen": 10579752, "step": 18350 }, { "epoch": 2.7338397378611856, "grad_norm": 0.23459866642951965, "learning_rate": 4.5560102878361935e-05, "loss": 0.7942, "num_input_tokens_seen": 10582408, "step": 18355 }, { "epoch": 2.734584450402145, "grad_norm": 0.244015172123909, "learning_rate": 4.5556404967647736e-05, "loss": 0.8086, "num_input_tokens_seen": 10585096, "step": 18360 }, { "epoch": 2.735329162943104, "grad_norm": 0.33359941840171814, "learning_rate": 4.5552705667815265e-05, "loss": 0.7887, "num_input_tokens_seen": 10587688, "step": 18365 }, { "epoch": 2.7360738754840632, "grad_norm": 0.295421302318573, "learning_rate": 4.5549004979114506e-05, "loss": 0.8064, "num_input_tokens_seen": 10590760, "step": 18370 }, { "epoch": 2.7368185880250224, "grad_norm": 0.28143611550331116, "learning_rate": 4.5545302901795536e-05, "loss": 0.7974, "num_input_tokens_seen": 10593544, "step": 18375 }, { "epoch": 2.7375633005659816, "grad_norm": 0.2762729525566101, "learning_rate": 4.5541599436108535e-05, "loss": 0.7893, "num_input_tokens_seen": 10596552, "step": 18380 }, { "epoch": 2.738308013106941, "grad_norm": 0.3567802608013153, "learning_rate": 4.553789458230375e-05, "loss": 0.7909, "num_input_tokens_seen": 10599432, "step": 18385 }, { "epoch": 2.7390527256479, "grad_norm": 0.3128993511199951, "learning_rate": 4.553418834063156e-05, "loss": 0.8035, "num_input_tokens_seen": 10602600, "step": 18390 }, { "epoch": 2.7397974381888592, "grad_norm": 0.2719098925590515, "learning_rate": 4.55304807113424e-05, "loss": 0.7871, "num_input_tokens_seen": 10605448, "step": 18395 }, { "epoch": 2.7405421507298184, "grad_norm": 0.3297634422779083, "learning_rate": 4.5526771694686835e-05, "loss": 0.8045, "num_input_tokens_seen": 10608296, "step": 18400 }, { "epoch": 2.7412868632707776, "grad_norm": 0.33256736397743225, "learning_rate": 4.552306129091548e-05, "loss": 0.7972, "num_input_tokens_seen": 10611272, "step": 18405 }, { "epoch": 2.742031575811737, "grad_norm": 0.33225932717323303, "learning_rate": 4.551934950027909e-05, "loss": 0.8554, "num_input_tokens_seen": 10614440, "step": 18410 }, { "epoch": 2.7427762883526956, "grad_norm": 0.267320841550827, "learning_rate": 4.551563632302849e-05, "loss": 0.7875, "num_input_tokens_seen": 10617192, "step": 18415 }, { "epoch": 2.7435210008936552, "grad_norm": 0.2777850925922394, "learning_rate": 4.551192175941459e-05, "loss": 0.818, "num_input_tokens_seen": 10619880, "step": 18420 }, { "epoch": 2.744265713434614, "grad_norm": 0.3223567306995392, "learning_rate": 4.550820580968842e-05, "loss": 0.8306, "num_input_tokens_seen": 10622536, "step": 18425 }, { "epoch": 2.7450104259755737, "grad_norm": 0.3243470788002014, "learning_rate": 4.550448847410108e-05, "loss": 0.8114, "num_input_tokens_seen": 10625864, "step": 18430 }, { "epoch": 2.7457551385165324, "grad_norm": 0.3352319002151489, "learning_rate": 4.550076975290377e-05, "loss": 0.7366, "num_input_tokens_seen": 10628776, "step": 18435 }, { "epoch": 2.746499851057492, "grad_norm": 0.430284708738327, "learning_rate": 4.549704964634779e-05, "loss": 0.8006, "num_input_tokens_seen": 10631496, "step": 18440 }, { "epoch": 2.747244563598451, "grad_norm": 0.398043692111969, "learning_rate": 4.549332815468453e-05, "loss": 0.8086, "num_input_tokens_seen": 10634600, "step": 18445 }, { "epoch": 2.7479892761394105, "grad_norm": 0.45892199873924255, "learning_rate": 4.5489605278165484e-05, "loss": 0.8299, "num_input_tokens_seen": 10637320, "step": 18450 }, { "epoch": 2.7487339886803692, "grad_norm": 0.34918081760406494, "learning_rate": 4.548588101704221e-05, "loss": 0.8056, "num_input_tokens_seen": 10640008, "step": 18455 }, { "epoch": 2.7494787012213284, "grad_norm": 0.35957807302474976, "learning_rate": 4.5482155371566384e-05, "loss": 0.7864, "num_input_tokens_seen": 10642856, "step": 18460 }, { "epoch": 2.7502234137622876, "grad_norm": 0.44838011264801025, "learning_rate": 4.5478428341989774e-05, "loss": 0.813, "num_input_tokens_seen": 10646024, "step": 18465 }, { "epoch": 2.750968126303247, "grad_norm": 0.3133252263069153, "learning_rate": 4.547469992856424e-05, "loss": 0.8128, "num_input_tokens_seen": 10649064, "step": 18470 }, { "epoch": 2.751712838844206, "grad_norm": 0.2508544921875, "learning_rate": 4.5470970131541727e-05, "loss": 0.817, "num_input_tokens_seen": 10651784, "step": 18475 }, { "epoch": 2.7524575513851652, "grad_norm": 0.33909133076667786, "learning_rate": 4.546723895117428e-05, "loss": 0.8413, "num_input_tokens_seen": 10654920, "step": 18480 }, { "epoch": 2.7532022639261244, "grad_norm": 0.3309532701969147, "learning_rate": 4.546350638771404e-05, "loss": 0.7911, "num_input_tokens_seen": 10657928, "step": 18485 }, { "epoch": 2.7539469764670836, "grad_norm": 0.25655797123908997, "learning_rate": 4.5459772441413234e-05, "loss": 0.7738, "num_input_tokens_seen": 10660808, "step": 18490 }, { "epoch": 2.754691689008043, "grad_norm": 0.28238001465797424, "learning_rate": 4.5456037112524195e-05, "loss": 0.8242, "num_input_tokens_seen": 10663592, "step": 18495 }, { "epoch": 2.755436401549002, "grad_norm": 0.26218822598457336, "learning_rate": 4.545230040129933e-05, "loss": 0.7802, "num_input_tokens_seen": 10666408, "step": 18500 }, { "epoch": 2.7561811140899612, "grad_norm": 0.30720055103302, "learning_rate": 4.544856230799116e-05, "loss": 0.8192, "num_input_tokens_seen": 10669096, "step": 18505 }, { "epoch": 2.7569258266309205, "grad_norm": 0.3106054365634918, "learning_rate": 4.544482283285228e-05, "loss": 0.8153, "num_input_tokens_seen": 10672008, "step": 18510 }, { "epoch": 2.7576705391718797, "grad_norm": 0.4170317053794861, "learning_rate": 4.54410819761354e-05, "loss": 0.831, "num_input_tokens_seen": 10674920, "step": 18515 }, { "epoch": 2.758415251712839, "grad_norm": 0.2317076325416565, "learning_rate": 4.5437339738093315e-05, "loss": 0.7932, "num_input_tokens_seen": 10677736, "step": 18520 }, { "epoch": 2.759159964253798, "grad_norm": 0.2706926167011261, "learning_rate": 4.54335961189789e-05, "loss": 0.8048, "num_input_tokens_seen": 10680488, "step": 18525 }, { "epoch": 2.7599046767947573, "grad_norm": 0.318889319896698, "learning_rate": 4.542985111904513e-05, "loss": 0.7879, "num_input_tokens_seen": 10683528, "step": 18530 }, { "epoch": 2.7606493893357165, "grad_norm": 0.31649789214134216, "learning_rate": 4.542610473854508e-05, "loss": 0.8248, "num_input_tokens_seen": 10686568, "step": 18535 }, { "epoch": 2.7613941018766757, "grad_norm": 0.3117066025733948, "learning_rate": 4.542235697773193e-05, "loss": 0.8094, "num_input_tokens_seen": 10689320, "step": 18540 }, { "epoch": 2.762138814417635, "grad_norm": 0.5166143178939819, "learning_rate": 4.5418607836858914e-05, "loss": 0.8185, "num_input_tokens_seen": 10692136, "step": 18545 }, { "epoch": 2.762883526958594, "grad_norm": 0.2574833035469055, "learning_rate": 4.5414857316179415e-05, "loss": 0.8008, "num_input_tokens_seen": 10695176, "step": 18550 }, { "epoch": 2.7636282394995533, "grad_norm": 0.3030836582183838, "learning_rate": 4.541110541594684e-05, "loss": 0.8168, "num_input_tokens_seen": 10697928, "step": 18555 }, { "epoch": 2.7643729520405125, "grad_norm": 0.3122755289077759, "learning_rate": 4.540735213641476e-05, "loss": 0.7922, "num_input_tokens_seen": 10700552, "step": 18560 }, { "epoch": 2.7651176645814717, "grad_norm": 0.21830546855926514, "learning_rate": 4.540359747783679e-05, "loss": 0.8226, "num_input_tokens_seen": 10703336, "step": 18565 }, { "epoch": 2.765862377122431, "grad_norm": 0.31035059690475464, "learning_rate": 4.539984144046665e-05, "loss": 0.8059, "num_input_tokens_seen": 10706024, "step": 18570 }, { "epoch": 2.76660708966339, "grad_norm": 0.26664063334465027, "learning_rate": 4.539608402455817e-05, "loss": 0.7994, "num_input_tokens_seen": 10709032, "step": 18575 }, { "epoch": 2.7673518022043493, "grad_norm": 0.35667839646339417, "learning_rate": 4.5392325230365264e-05, "loss": 0.8067, "num_input_tokens_seen": 10711976, "step": 18580 }, { "epoch": 2.7680965147453085, "grad_norm": 0.33779725432395935, "learning_rate": 4.538856505814191e-05, "loss": 0.794, "num_input_tokens_seen": 10714760, "step": 18585 }, { "epoch": 2.7688412272862672, "grad_norm": 0.32488390803337097, "learning_rate": 4.5384803508142235e-05, "loss": 0.8009, "num_input_tokens_seen": 10717704, "step": 18590 }, { "epoch": 2.769585939827227, "grad_norm": 0.3208564519882202, "learning_rate": 4.538104058062042e-05, "loss": 0.7862, "num_input_tokens_seen": 10720552, "step": 18595 }, { "epoch": 2.7703306523681857, "grad_norm": 0.3470397889614105, "learning_rate": 4.537727627583074e-05, "loss": 0.8112, "num_input_tokens_seen": 10723752, "step": 18600 }, { "epoch": 2.7710753649091453, "grad_norm": 0.2528504729270935, "learning_rate": 4.5373510594027576e-05, "loss": 0.8315, "num_input_tokens_seen": 10726440, "step": 18605 }, { "epoch": 2.771820077450104, "grad_norm": 0.2942143380641937, "learning_rate": 4.53697435354654e-05, "loss": 0.8237, "num_input_tokens_seen": 10728904, "step": 18610 }, { "epoch": 2.7725647899910637, "grad_norm": 0.34954121708869934, "learning_rate": 4.536597510039878e-05, "loss": 0.8054, "num_input_tokens_seen": 10731560, "step": 18615 }, { "epoch": 2.7733095025320225, "grad_norm": 0.33220234513282776, "learning_rate": 4.536220528908236e-05, "loss": 0.7997, "num_input_tokens_seen": 10734312, "step": 18620 }, { "epoch": 2.7740542150729817, "grad_norm": 0.2798140347003937, "learning_rate": 4.535843410177089e-05, "loss": 0.8224, "num_input_tokens_seen": 10737288, "step": 18625 }, { "epoch": 2.774798927613941, "grad_norm": 0.27085545659065247, "learning_rate": 4.5354661538719224e-05, "loss": 0.8203, "num_input_tokens_seen": 10740200, "step": 18630 }, { "epoch": 2.7755436401549, "grad_norm": 0.22590364515781403, "learning_rate": 4.5350887600182275e-05, "loss": 0.813, "num_input_tokens_seen": 10742984, "step": 18635 }, { "epoch": 2.7762883526958593, "grad_norm": 0.4372204542160034, "learning_rate": 4.534711228641509e-05, "loss": 0.8266, "num_input_tokens_seen": 10745704, "step": 18640 }, { "epoch": 2.7770330652368185, "grad_norm": 0.21612006425857544, "learning_rate": 4.5343335597672776e-05, "loss": 0.8009, "num_input_tokens_seen": 10748264, "step": 18645 }, { "epoch": 2.7777777777777777, "grad_norm": 0.4448430836200714, "learning_rate": 4.5339557534210565e-05, "loss": 0.8146, "num_input_tokens_seen": 10751272, "step": 18650 }, { "epoch": 2.778522490318737, "grad_norm": 0.24853524565696716, "learning_rate": 4.533577809628374e-05, "loss": 0.8085, "num_input_tokens_seen": 10754280, "step": 18655 }, { "epoch": 2.779267202859696, "grad_norm": 0.26166054606437683, "learning_rate": 4.533199728414771e-05, "loss": 0.8255, "num_input_tokens_seen": 10757384, "step": 18660 }, { "epoch": 2.7800119154006553, "grad_norm": 0.2765321135520935, "learning_rate": 4.532821509805797e-05, "loss": 0.796, "num_input_tokens_seen": 10760296, "step": 18665 }, { "epoch": 2.7807566279416145, "grad_norm": 0.26674020290374756, "learning_rate": 4.53244315382701e-05, "loss": 0.8039, "num_input_tokens_seen": 10763112, "step": 18670 }, { "epoch": 2.7815013404825737, "grad_norm": 0.36210373044013977, "learning_rate": 4.532064660503978e-05, "loss": 0.8282, "num_input_tokens_seen": 10765928, "step": 18675 }, { "epoch": 2.782246053023533, "grad_norm": 0.29144737124443054, "learning_rate": 4.531686029862279e-05, "loss": 0.8185, "num_input_tokens_seen": 10768680, "step": 18680 }, { "epoch": 2.782990765564492, "grad_norm": 0.356106698513031, "learning_rate": 4.531307261927497e-05, "loss": 0.8225, "num_input_tokens_seen": 10771720, "step": 18685 }, { "epoch": 2.7837354781054513, "grad_norm": 0.2386067658662796, "learning_rate": 4.530928356725229e-05, "loss": 0.8038, "num_input_tokens_seen": 10774216, "step": 18690 }, { "epoch": 2.7844801906464105, "grad_norm": 0.3804270327091217, "learning_rate": 4.530549314281081e-05, "loss": 0.7762, "num_input_tokens_seen": 10777000, "step": 18695 }, { "epoch": 2.7852249031873697, "grad_norm": 0.3703525960445404, "learning_rate": 4.530170134620665e-05, "loss": 0.8165, "num_input_tokens_seen": 10779656, "step": 18700 }, { "epoch": 2.785969615728329, "grad_norm": 0.3155137598514557, "learning_rate": 4.529790817769606e-05, "loss": 0.8167, "num_input_tokens_seen": 10782504, "step": 18705 }, { "epoch": 2.786714328269288, "grad_norm": 0.3068779408931732, "learning_rate": 4.529411363753535e-05, "loss": 0.7602, "num_input_tokens_seen": 10785320, "step": 18710 }, { "epoch": 2.7874590408102473, "grad_norm": 0.3490031957626343, "learning_rate": 4.5290317725980964e-05, "loss": 0.8278, "num_input_tokens_seen": 10788328, "step": 18715 }, { "epoch": 2.7882037533512065, "grad_norm": 0.28075969219207764, "learning_rate": 4.5286520443289396e-05, "loss": 0.784, "num_input_tokens_seen": 10790888, "step": 18720 }, { "epoch": 2.7889484658921657, "grad_norm": 0.30286356806755066, "learning_rate": 4.528272178971725e-05, "loss": 0.8073, "num_input_tokens_seen": 10793960, "step": 18725 }, { "epoch": 2.789693178433125, "grad_norm": 0.28225505352020264, "learning_rate": 4.5278921765521234e-05, "loss": 0.7961, "num_input_tokens_seen": 10796872, "step": 18730 }, { "epoch": 2.790437890974084, "grad_norm": 0.3594607412815094, "learning_rate": 4.5275120370958124e-05, "loss": 0.8233, "num_input_tokens_seen": 10799880, "step": 18735 }, { "epoch": 2.7911826035150433, "grad_norm": 0.3016793727874756, "learning_rate": 4.5271317606284826e-05, "loss": 0.8231, "num_input_tokens_seen": 10802568, "step": 18740 }, { "epoch": 2.7919273160560025, "grad_norm": 0.2803948223590851, "learning_rate": 4.5267513471758295e-05, "loss": 0.8016, "num_input_tokens_seen": 10805480, "step": 18745 }, { "epoch": 2.7926720285969617, "grad_norm": 0.3730054795742035, "learning_rate": 4.5263707967635596e-05, "loss": 0.8283, "num_input_tokens_seen": 10808680, "step": 18750 }, { "epoch": 2.7934167411379205, "grad_norm": 0.2943832576274872, "learning_rate": 4.5259901094173915e-05, "loss": 0.8147, "num_input_tokens_seen": 10811720, "step": 18755 }, { "epoch": 2.79416145367888, "grad_norm": 0.3107406795024872, "learning_rate": 4.525609285163048e-05, "loss": 0.7816, "num_input_tokens_seen": 10814568, "step": 18760 }, { "epoch": 2.794906166219839, "grad_norm": 0.3490141034126282, "learning_rate": 4.525228324026265e-05, "loss": 0.7805, "num_input_tokens_seen": 10817448, "step": 18765 }, { "epoch": 2.7956508787607985, "grad_norm": 0.2322356402873993, "learning_rate": 4.5248472260327854e-05, "loss": 0.8558, "num_input_tokens_seen": 10820584, "step": 18770 }, { "epoch": 2.7963955913017573, "grad_norm": 0.19675926864147186, "learning_rate": 4.5244659912083626e-05, "loss": 0.7527, "num_input_tokens_seen": 10823400, "step": 18775 }, { "epoch": 2.797140303842717, "grad_norm": 0.3199714422225952, "learning_rate": 4.524084619578759e-05, "loss": 0.8296, "num_input_tokens_seen": 10826024, "step": 18780 }, { "epoch": 2.7978850163836757, "grad_norm": 0.2504366934299469, "learning_rate": 4.523703111169746e-05, "loss": 0.8021, "num_input_tokens_seen": 10829000, "step": 18785 }, { "epoch": 2.7986297289246354, "grad_norm": 0.2632543444633484, "learning_rate": 4.5233214660071055e-05, "loss": 0.7968, "num_input_tokens_seen": 10831912, "step": 18790 }, { "epoch": 2.799374441465594, "grad_norm": 0.3491356670856476, "learning_rate": 4.522939684116626e-05, "loss": 0.8343, "num_input_tokens_seen": 10835048, "step": 18795 }, { "epoch": 2.8001191540065533, "grad_norm": 0.4534246027469635, "learning_rate": 4.522557765524107e-05, "loss": 0.8324, "num_input_tokens_seen": 10837992, "step": 18800 }, { "epoch": 2.8008638665475125, "grad_norm": 0.4709221124649048, "learning_rate": 4.5221757102553576e-05, "loss": 0.7858, "num_input_tokens_seen": 10840808, "step": 18805 }, { "epoch": 2.8016085790884717, "grad_norm": 0.302566260099411, "learning_rate": 4.521793518336195e-05, "loss": 0.7981, "num_input_tokens_seen": 10843720, "step": 18810 }, { "epoch": 2.802353291629431, "grad_norm": 0.35815075039863586, "learning_rate": 4.521411189792447e-05, "loss": 0.8012, "num_input_tokens_seen": 10846920, "step": 18815 }, { "epoch": 2.80309800417039, "grad_norm": 0.27092233300209045, "learning_rate": 4.521028724649949e-05, "loss": 0.8212, "num_input_tokens_seen": 10849832, "step": 18820 }, { "epoch": 2.8038427167113493, "grad_norm": 0.2522723078727722, "learning_rate": 4.520646122934547e-05, "loss": 0.7964, "num_input_tokens_seen": 10852584, "step": 18825 }, { "epoch": 2.8045874292523085, "grad_norm": 0.26674363017082214, "learning_rate": 4.5202633846720944e-05, "loss": 0.8047, "num_input_tokens_seen": 10855624, "step": 18830 }, { "epoch": 2.8053321417932677, "grad_norm": 0.2664240002632141, "learning_rate": 4.519880509888457e-05, "loss": 0.793, "num_input_tokens_seen": 10858632, "step": 18835 }, { "epoch": 2.806076854334227, "grad_norm": 0.31973257660865784, "learning_rate": 4.519497498609506e-05, "loss": 0.8064, "num_input_tokens_seen": 10861544, "step": 18840 }, { "epoch": 2.806821566875186, "grad_norm": 0.20685185492038727, "learning_rate": 4.519114350861125e-05, "loss": 0.79, "num_input_tokens_seen": 10864392, "step": 18845 }, { "epoch": 2.8075662794161453, "grad_norm": 0.34822729229927063, "learning_rate": 4.5187310666692065e-05, "loss": 0.8319, "num_input_tokens_seen": 10867176, "step": 18850 }, { "epoch": 2.8083109919571045, "grad_norm": 0.24161618947982788, "learning_rate": 4.5183476460596486e-05, "loss": 0.8213, "num_input_tokens_seen": 10869992, "step": 18855 }, { "epoch": 2.8090557044980637, "grad_norm": 0.28919824957847595, "learning_rate": 4.5179640890583634e-05, "loss": 0.8108, "num_input_tokens_seen": 10873096, "step": 18860 }, { "epoch": 2.809800417039023, "grad_norm": 0.2585037350654602, "learning_rate": 4.51758039569127e-05, "loss": 0.8382, "num_input_tokens_seen": 10875720, "step": 18865 }, { "epoch": 2.810545129579982, "grad_norm": 0.22919251024723053, "learning_rate": 4.517196565984296e-05, "loss": 0.8164, "num_input_tokens_seen": 10878728, "step": 18870 }, { "epoch": 2.8112898421209414, "grad_norm": 0.2599160075187683, "learning_rate": 4.5168125999633794e-05, "loss": 0.7672, "num_input_tokens_seen": 10881480, "step": 18875 }, { "epoch": 2.8120345546619006, "grad_norm": 0.2359396517276764, "learning_rate": 4.5164284976544664e-05, "loss": 0.8542, "num_input_tokens_seen": 10883976, "step": 18880 }, { "epoch": 2.8127792672028598, "grad_norm": 0.3073355555534363, "learning_rate": 4.516044259083514e-05, "loss": 0.7895, "num_input_tokens_seen": 10886728, "step": 18885 }, { "epoch": 2.813523979743819, "grad_norm": 0.24187706410884857, "learning_rate": 4.515659884276487e-05, "loss": 0.8292, "num_input_tokens_seen": 10889608, "step": 18890 }, { "epoch": 2.814268692284778, "grad_norm": 0.2998482584953308, "learning_rate": 4.515275373259361e-05, "loss": 0.8087, "num_input_tokens_seen": 10892328, "step": 18895 }, { "epoch": 2.8150134048257374, "grad_norm": 0.2792421877384186, "learning_rate": 4.5148907260581185e-05, "loss": 0.803, "num_input_tokens_seen": 10895304, "step": 18900 }, { "epoch": 2.8157581173666966, "grad_norm": 0.32374411821365356, "learning_rate": 4.5145059426987523e-05, "loss": 0.8068, "num_input_tokens_seen": 10898184, "step": 18905 }, { "epoch": 2.8165028299076558, "grad_norm": 0.3168492913246155, "learning_rate": 4.514121023207265e-05, "loss": 0.8147, "num_input_tokens_seen": 10900968, "step": 18910 }, { "epoch": 2.817247542448615, "grad_norm": 0.3216346204280853, "learning_rate": 4.513735967609668e-05, "loss": 0.8123, "num_input_tokens_seen": 10903624, "step": 18915 }, { "epoch": 2.817992254989574, "grad_norm": 0.25333940982818604, "learning_rate": 4.5133507759319816e-05, "loss": 0.8189, "num_input_tokens_seen": 10906696, "step": 18920 }, { "epoch": 2.8187369675305334, "grad_norm": 0.3091747462749481, "learning_rate": 4.512965448200235e-05, "loss": 0.7931, "num_input_tokens_seen": 10909320, "step": 18925 }, { "epoch": 2.819481680071492, "grad_norm": 0.2940203547477722, "learning_rate": 4.5125799844404683e-05, "loss": 0.8405, "num_input_tokens_seen": 10912008, "step": 18930 }, { "epoch": 2.820226392612452, "grad_norm": 0.34863215684890747, "learning_rate": 4.512194384678728e-05, "loss": 0.8308, "num_input_tokens_seen": 10914920, "step": 18935 }, { "epoch": 2.8209711051534105, "grad_norm": 0.2811158299446106, "learning_rate": 4.511808648941073e-05, "loss": 0.7877, "num_input_tokens_seen": 10917640, "step": 18940 }, { "epoch": 2.82171581769437, "grad_norm": 0.29816892743110657, "learning_rate": 4.511422777253568e-05, "loss": 0.8436, "num_input_tokens_seen": 10920520, "step": 18945 }, { "epoch": 2.822460530235329, "grad_norm": 0.2354561984539032, "learning_rate": 4.51103676964229e-05, "loss": 0.8179, "num_input_tokens_seen": 10923400, "step": 18950 }, { "epoch": 2.8232052427762886, "grad_norm": 0.25792860984802246, "learning_rate": 4.5106506261333234e-05, "loss": 0.8636, "num_input_tokens_seen": 10926312, "step": 18955 }, { "epoch": 2.8239499553172474, "grad_norm": 0.2898171842098236, "learning_rate": 4.5102643467527616e-05, "loss": 0.809, "num_input_tokens_seen": 10929096, "step": 18960 }, { "epoch": 2.824694667858207, "grad_norm": 0.3094172775745392, "learning_rate": 4.509877931526709e-05, "loss": 0.8225, "num_input_tokens_seen": 10931656, "step": 18965 }, { "epoch": 2.8254393803991658, "grad_norm": 0.3771180510520935, "learning_rate": 4.5094913804812776e-05, "loss": 0.8081, "num_input_tokens_seen": 10934408, "step": 18970 }, { "epoch": 2.826184092940125, "grad_norm": 0.2070309817790985, "learning_rate": 4.509104693642588e-05, "loss": 0.7878, "num_input_tokens_seen": 10937160, "step": 18975 }, { "epoch": 2.826928805481084, "grad_norm": 0.3644925057888031, "learning_rate": 4.508717871036772e-05, "loss": 0.8153, "num_input_tokens_seen": 10940136, "step": 18980 }, { "epoch": 2.8276735180220434, "grad_norm": 0.2968228757381439, "learning_rate": 4.508330912689969e-05, "loss": 0.8068, "num_input_tokens_seen": 10943080, "step": 18985 }, { "epoch": 2.8284182305630026, "grad_norm": 0.24420495331287384, "learning_rate": 4.5079438186283285e-05, "loss": 0.8041, "num_input_tokens_seen": 10945640, "step": 18990 }, { "epoch": 2.8291629431039618, "grad_norm": 0.25352728366851807, "learning_rate": 4.507556588878009e-05, "loss": 0.8062, "num_input_tokens_seen": 10948456, "step": 18995 }, { "epoch": 2.829907655644921, "grad_norm": 0.2512144446372986, "learning_rate": 4.5071692234651764e-05, "loss": 0.8052, "num_input_tokens_seen": 10951336, "step": 19000 }, { "epoch": 2.83065236818588, "grad_norm": 0.277935653924942, "learning_rate": 4.506781722416008e-05, "loss": 0.7989, "num_input_tokens_seen": 10954312, "step": 19005 }, { "epoch": 2.8313970807268394, "grad_norm": 0.47249191999435425, "learning_rate": 4.5063940857566896e-05, "loss": 0.8289, "num_input_tokens_seen": 10957192, "step": 19010 }, { "epoch": 2.8321417932677986, "grad_norm": 0.349700003862381, "learning_rate": 4.506006313513418e-05, "loss": 0.7981, "num_input_tokens_seen": 10959720, "step": 19015 }, { "epoch": 2.832886505808758, "grad_norm": 0.3195176124572754, "learning_rate": 4.505618405712394e-05, "loss": 0.7898, "num_input_tokens_seen": 10962696, "step": 19020 }, { "epoch": 2.833631218349717, "grad_norm": 0.3584403097629547, "learning_rate": 4.505230362379833e-05, "loss": 0.8133, "num_input_tokens_seen": 10965672, "step": 19025 }, { "epoch": 2.834375930890676, "grad_norm": 0.26753440499305725, "learning_rate": 4.504842183541956e-05, "loss": 0.8187, "num_input_tokens_seen": 10968616, "step": 19030 }, { "epoch": 2.8351206434316354, "grad_norm": 0.38649970293045044, "learning_rate": 4.5044538692249964e-05, "loss": 0.8175, "num_input_tokens_seen": 10971592, "step": 19035 }, { "epoch": 2.8358653559725946, "grad_norm": 0.28558051586151123, "learning_rate": 4.504065419455193e-05, "loss": 0.7984, "num_input_tokens_seen": 10974696, "step": 19040 }, { "epoch": 2.836610068513554, "grad_norm": 0.3551676869392395, "learning_rate": 4.503676834258798e-05, "loss": 0.7911, "num_input_tokens_seen": 10977384, "step": 19045 }, { "epoch": 2.837354781054513, "grad_norm": 0.23325015604496002, "learning_rate": 4.503288113662068e-05, "loss": 0.8079, "num_input_tokens_seen": 10980264, "step": 19050 }, { "epoch": 2.838099493595472, "grad_norm": 0.39890599250793457, "learning_rate": 4.5028992576912714e-05, "loss": 0.8082, "num_input_tokens_seen": 10983112, "step": 19055 }, { "epoch": 2.8388442061364314, "grad_norm": 0.41649195551872253, "learning_rate": 4.502510266372687e-05, "loss": 0.8328, "num_input_tokens_seen": 10985960, "step": 19060 }, { "epoch": 2.8395889186773906, "grad_norm": 0.2827758491039276, "learning_rate": 4.5021211397326e-05, "loss": 0.7998, "num_input_tokens_seen": 10988968, "step": 19065 }, { "epoch": 2.84033363121835, "grad_norm": 0.223593607544899, "learning_rate": 4.501731877797306e-05, "loss": 0.7722, "num_input_tokens_seen": 10991880, "step": 19070 }, { "epoch": 2.841078343759309, "grad_norm": 0.21719181537628174, "learning_rate": 4.5013424805931104e-05, "loss": 0.7885, "num_input_tokens_seen": 10994632, "step": 19075 }, { "epoch": 2.841823056300268, "grad_norm": 0.37540531158447266, "learning_rate": 4.5009529481463274e-05, "loss": 0.8544, "num_input_tokens_seen": 10997384, "step": 19080 }, { "epoch": 2.8425677688412274, "grad_norm": 0.29330331087112427, "learning_rate": 4.5005632804832786e-05, "loss": 0.792, "num_input_tokens_seen": 11000168, "step": 19085 }, { "epoch": 2.8433124813821866, "grad_norm": 0.24686895310878754, "learning_rate": 4.500173477630298e-05, "loss": 0.8117, "num_input_tokens_seen": 11003144, "step": 19090 }, { "epoch": 2.844057193923146, "grad_norm": 0.24901795387268066, "learning_rate": 4.499783539613726e-05, "loss": 0.8314, "num_input_tokens_seen": 11006376, "step": 19095 }, { "epoch": 2.844801906464105, "grad_norm": 0.30633431673049927, "learning_rate": 4.4993934664599116e-05, "loss": 0.7907, "num_input_tokens_seen": 11009288, "step": 19100 }, { "epoch": 2.845546619005064, "grad_norm": 0.3341327905654907, "learning_rate": 4.4990032581952166e-05, "loss": 0.8373, "num_input_tokens_seen": 11012232, "step": 19105 }, { "epoch": 2.8462913315460234, "grad_norm": 0.36929264664649963, "learning_rate": 4.498612914846008e-05, "loss": 0.7921, "num_input_tokens_seen": 11015144, "step": 19110 }, { "epoch": 2.847036044086982, "grad_norm": 0.37262973189353943, "learning_rate": 4.498222436438665e-05, "loss": 0.7858, "num_input_tokens_seen": 11018152, "step": 19115 }, { "epoch": 2.847780756627942, "grad_norm": 0.40802058577537537, "learning_rate": 4.497831822999574e-05, "loss": 0.7988, "num_input_tokens_seen": 11021096, "step": 19120 }, { "epoch": 2.8485254691689006, "grad_norm": 0.29873204231262207, "learning_rate": 4.497441074555131e-05, "loss": 0.8103, "num_input_tokens_seen": 11023944, "step": 19125 }, { "epoch": 2.8492701817098602, "grad_norm": 0.30047541856765747, "learning_rate": 4.497050191131741e-05, "loss": 0.7867, "num_input_tokens_seen": 11026696, "step": 19130 }, { "epoch": 2.850014894250819, "grad_norm": 0.4525800347328186, "learning_rate": 4.4966591727558184e-05, "loss": 0.824, "num_input_tokens_seen": 11029864, "step": 19135 }, { "epoch": 2.8507596067917786, "grad_norm": 0.2797855734825134, "learning_rate": 4.496268019453787e-05, "loss": 0.8104, "num_input_tokens_seen": 11033256, "step": 19140 }, { "epoch": 2.8515043193327374, "grad_norm": 0.320536732673645, "learning_rate": 4.495876731252079e-05, "loss": 0.8004, "num_input_tokens_seen": 11036392, "step": 19145 }, { "epoch": 2.8522490318736966, "grad_norm": 0.2697421610355377, "learning_rate": 4.495485308177136e-05, "loss": 0.7806, "num_input_tokens_seen": 11039176, "step": 19150 }, { "epoch": 2.852993744414656, "grad_norm": 0.3258861303329468, "learning_rate": 4.495093750255409e-05, "loss": 0.7744, "num_input_tokens_seen": 11042280, "step": 19155 }, { "epoch": 2.853738456955615, "grad_norm": 0.31225016713142395, "learning_rate": 4.494702057513358e-05, "loss": 0.8403, "num_input_tokens_seen": 11045000, "step": 19160 }, { "epoch": 2.854483169496574, "grad_norm": 0.361853688955307, "learning_rate": 4.4943102299774513e-05, "loss": 0.8367, "num_input_tokens_seen": 11048200, "step": 19165 }, { "epoch": 2.8552278820375334, "grad_norm": 0.35456550121307373, "learning_rate": 4.493918267674168e-05, "loss": 0.7955, "num_input_tokens_seen": 11051016, "step": 19170 }, { "epoch": 2.8559725945784926, "grad_norm": 0.356932669878006, "learning_rate": 4.4935261706299944e-05, "loss": 0.7892, "num_input_tokens_seen": 11053928, "step": 19175 }, { "epoch": 2.856717307119452, "grad_norm": 0.34663254022598267, "learning_rate": 4.4931339388714276e-05, "loss": 0.7901, "num_input_tokens_seen": 11056744, "step": 19180 }, { "epoch": 2.857462019660411, "grad_norm": 0.25137874484062195, "learning_rate": 4.4927415724249735e-05, "loss": 0.8241, "num_input_tokens_seen": 11059368, "step": 19185 }, { "epoch": 2.8582067322013702, "grad_norm": 0.32876038551330566, "learning_rate": 4.492349071317145e-05, "loss": 0.8035, "num_input_tokens_seen": 11062088, "step": 19190 }, { "epoch": 2.8589514447423294, "grad_norm": 0.22733552753925323, "learning_rate": 4.491956435574466e-05, "loss": 0.7972, "num_input_tokens_seen": 11064936, "step": 19195 }, { "epoch": 2.8596961572832886, "grad_norm": 0.2929706275463104, "learning_rate": 4.491563665223471e-05, "loss": 0.7856, "num_input_tokens_seen": 11067624, "step": 19200 }, { "epoch": 2.860440869824248, "grad_norm": 0.24549296498298645, "learning_rate": 4.491170760290699e-05, "loss": 0.7765, "num_input_tokens_seen": 11070600, "step": 19205 }, { "epoch": 2.861185582365207, "grad_norm": 0.23919369280338287, "learning_rate": 4.4907777208027044e-05, "loss": 0.8107, "num_input_tokens_seen": 11073608, "step": 19210 }, { "epoch": 2.8619302949061662, "grad_norm": 0.3175407350063324, "learning_rate": 4.490384546786044e-05, "loss": 0.795, "num_input_tokens_seen": 11076328, "step": 19215 }, { "epoch": 2.8626750074471254, "grad_norm": 0.3728577792644501, "learning_rate": 4.489991238267289e-05, "loss": 0.8113, "num_input_tokens_seen": 11078984, "step": 19220 }, { "epoch": 2.8634197199880846, "grad_norm": 0.2261979877948761, "learning_rate": 4.489597795273016e-05, "loss": 0.8231, "num_input_tokens_seen": 11081800, "step": 19225 }, { "epoch": 2.864164432529044, "grad_norm": 0.3422435224056244, "learning_rate": 4.4892042178298136e-05, "loss": 0.8067, "num_input_tokens_seen": 11084936, "step": 19230 }, { "epoch": 2.864909145070003, "grad_norm": 0.3101840317249298, "learning_rate": 4.488810505964278e-05, "loss": 0.8284, "num_input_tokens_seen": 11087848, "step": 19235 }, { "epoch": 2.8656538576109623, "grad_norm": 0.2651522159576416, "learning_rate": 4.488416659703014e-05, "loss": 0.8179, "num_input_tokens_seen": 11090632, "step": 19240 }, { "epoch": 2.8663985701519215, "grad_norm": 0.3972131609916687, "learning_rate": 4.4880226790726366e-05, "loss": 0.7935, "num_input_tokens_seen": 11093640, "step": 19245 }, { "epoch": 2.8671432826928807, "grad_norm": 0.33780211210250854, "learning_rate": 4.4876285640997694e-05, "loss": 0.8045, "num_input_tokens_seen": 11096168, "step": 19250 }, { "epoch": 2.86788799523384, "grad_norm": 0.27667802572250366, "learning_rate": 4.487234314811044e-05, "loss": 0.8031, "num_input_tokens_seen": 11098920, "step": 19255 }, { "epoch": 2.868632707774799, "grad_norm": 0.3396700620651245, "learning_rate": 4.486839931233104e-05, "loss": 0.8145, "num_input_tokens_seen": 11101896, "step": 19260 }, { "epoch": 2.8693774203157583, "grad_norm": 0.26279330253601074, "learning_rate": 4.486445413392599e-05, "loss": 0.7847, "num_input_tokens_seen": 11104936, "step": 19265 }, { "epoch": 2.8701221328567175, "grad_norm": 0.28288522362709045, "learning_rate": 4.48605076131619e-05, "loss": 0.7994, "num_input_tokens_seen": 11107816, "step": 19270 }, { "epoch": 2.8708668453976767, "grad_norm": 0.4136272966861725, "learning_rate": 4.485655975030545e-05, "loss": 0.7983, "num_input_tokens_seen": 11110440, "step": 19275 }, { "epoch": 2.8716115579386354, "grad_norm": 0.28689706325531006, "learning_rate": 4.485261054562342e-05, "loss": 0.8362, "num_input_tokens_seen": 11113288, "step": 19280 }, { "epoch": 2.872356270479595, "grad_norm": 0.33925795555114746, "learning_rate": 4.484865999938268e-05, "loss": 0.8136, "num_input_tokens_seen": 11116008, "step": 19285 }, { "epoch": 2.873100983020554, "grad_norm": 0.29199928045272827, "learning_rate": 4.4844708111850195e-05, "loss": 0.8176, "num_input_tokens_seen": 11119016, "step": 19290 }, { "epoch": 2.8738456955615135, "grad_norm": 0.2762201130390167, "learning_rate": 4.4840754883293025e-05, "loss": 0.806, "num_input_tokens_seen": 11121768, "step": 19295 }, { "epoch": 2.8745904081024722, "grad_norm": 0.43748635053634644, "learning_rate": 4.483680031397831e-05, "loss": 0.7909, "num_input_tokens_seen": 11124552, "step": 19300 }, { "epoch": 2.875335120643432, "grad_norm": 0.2538279592990875, "learning_rate": 4.4832844404173266e-05, "loss": 0.8397, "num_input_tokens_seen": 11127464, "step": 19305 }, { "epoch": 2.8760798331843906, "grad_norm": 0.25116047263145447, "learning_rate": 4.482888715414525e-05, "loss": 0.7872, "num_input_tokens_seen": 11130472, "step": 19310 }, { "epoch": 2.8768245457253503, "grad_norm": 0.3277878761291504, "learning_rate": 4.482492856416165e-05, "loss": 0.8143, "num_input_tokens_seen": 11133064, "step": 19315 }, { "epoch": 2.877569258266309, "grad_norm": 0.307790070772171, "learning_rate": 4.482096863448998e-05, "loss": 0.8077, "num_input_tokens_seen": 11136136, "step": 19320 }, { "epoch": 2.8783139708072683, "grad_norm": 0.2754441499710083, "learning_rate": 4.481700736539784e-05, "loss": 0.7859, "num_input_tokens_seen": 11138984, "step": 19325 }, { "epoch": 2.8790586833482275, "grad_norm": 0.33219656348228455, "learning_rate": 4.48130447571529e-05, "loss": 0.7717, "num_input_tokens_seen": 11141992, "step": 19330 }, { "epoch": 2.8798033958891867, "grad_norm": 0.33637166023254395, "learning_rate": 4.480908081002296e-05, "loss": 0.821, "num_input_tokens_seen": 11144872, "step": 19335 }, { "epoch": 2.880548108430146, "grad_norm": 0.25142747163772583, "learning_rate": 4.480511552427587e-05, "loss": 0.8086, "num_input_tokens_seen": 11147848, "step": 19340 }, { "epoch": 2.881292820971105, "grad_norm": 0.22383087873458862, "learning_rate": 4.48011489001796e-05, "loss": 0.7984, "num_input_tokens_seen": 11150536, "step": 19345 }, { "epoch": 2.8820375335120643, "grad_norm": 0.22988775372505188, "learning_rate": 4.479718093800219e-05, "loss": 0.8458, "num_input_tokens_seen": 11153448, "step": 19350 }, { "epoch": 2.8827822460530235, "grad_norm": 0.25627607107162476, "learning_rate": 4.4793211638011786e-05, "loss": 0.8026, "num_input_tokens_seen": 11156040, "step": 19355 }, { "epoch": 2.8835269585939827, "grad_norm": 0.37725967168807983, "learning_rate": 4.478924100047661e-05, "loss": 0.8167, "num_input_tokens_seen": 11159112, "step": 19360 }, { "epoch": 2.884271671134942, "grad_norm": 0.32684677839279175, "learning_rate": 4.478526902566498e-05, "loss": 0.7888, "num_input_tokens_seen": 11162088, "step": 19365 }, { "epoch": 2.885016383675901, "grad_norm": 0.2652234435081482, "learning_rate": 4.4781295713845314e-05, "loss": 0.8322, "num_input_tokens_seen": 11164936, "step": 19370 }, { "epoch": 2.8857610962168603, "grad_norm": 0.28504279255867004, "learning_rate": 4.477732106528611e-05, "loss": 0.8045, "num_input_tokens_seen": 11167784, "step": 19375 }, { "epoch": 2.8865058087578195, "grad_norm": 0.2991313636302948, "learning_rate": 4.4773345080255955e-05, "loss": 0.7735, "num_input_tokens_seen": 11171144, "step": 19380 }, { "epoch": 2.8872505212987787, "grad_norm": 0.3626883625984192, "learning_rate": 4.4769367759023536e-05, "loss": 0.8261, "num_input_tokens_seen": 11174312, "step": 19385 }, { "epoch": 2.887995233839738, "grad_norm": 0.4247279763221741, "learning_rate": 4.4765389101857616e-05, "loss": 0.8123, "num_input_tokens_seen": 11177704, "step": 19390 }, { "epoch": 2.888739946380697, "grad_norm": 0.44860559701919556, "learning_rate": 4.4761409109027065e-05, "loss": 0.8254, "num_input_tokens_seen": 11180808, "step": 19395 }, { "epoch": 2.8894846589216563, "grad_norm": 0.26391494274139404, "learning_rate": 4.4757427780800825e-05, "loss": 0.792, "num_input_tokens_seen": 11183656, "step": 19400 }, { "epoch": 2.8902293714626155, "grad_norm": 0.3081252872943878, "learning_rate": 4.475344511744794e-05, "loss": 0.822, "num_input_tokens_seen": 11186568, "step": 19405 }, { "epoch": 2.8909740840035747, "grad_norm": 0.30576449632644653, "learning_rate": 4.4749461119237555e-05, "loss": 0.7858, "num_input_tokens_seen": 11189224, "step": 19410 }, { "epoch": 2.891718796544534, "grad_norm": 0.3420686423778534, "learning_rate": 4.4745475786438886e-05, "loss": 0.8229, "num_input_tokens_seen": 11192072, "step": 19415 }, { "epoch": 2.892463509085493, "grad_norm": 0.33802762627601624, "learning_rate": 4.4741489119321235e-05, "loss": 0.7778, "num_input_tokens_seen": 11195304, "step": 19420 }, { "epoch": 2.8932082216264523, "grad_norm": 0.3218270540237427, "learning_rate": 4.4737501118154014e-05, "loss": 0.8071, "num_input_tokens_seen": 11198248, "step": 19425 }, { "epoch": 2.8939529341674115, "grad_norm": 0.3149230480194092, "learning_rate": 4.473351178320671e-05, "loss": 0.8155, "num_input_tokens_seen": 11200872, "step": 19430 }, { "epoch": 2.8946976467083707, "grad_norm": 0.2800140976905823, "learning_rate": 4.472952111474892e-05, "loss": 0.7892, "num_input_tokens_seen": 11203688, "step": 19435 }, { "epoch": 2.89544235924933, "grad_norm": 0.27006664872169495, "learning_rate": 4.47255291130503e-05, "loss": 0.787, "num_input_tokens_seen": 11206728, "step": 19440 }, { "epoch": 2.896187071790289, "grad_norm": 0.24017076194286346, "learning_rate": 4.472153577838062e-05, "loss": 0.84, "num_input_tokens_seen": 11209544, "step": 19445 }, { "epoch": 2.8969317843312483, "grad_norm": 0.26272618770599365, "learning_rate": 4.471754111100974e-05, "loss": 0.8055, "num_input_tokens_seen": 11212328, "step": 19450 }, { "epoch": 2.897676496872207, "grad_norm": 0.2406204342842102, "learning_rate": 4.471354511120759e-05, "loss": 0.8235, "num_input_tokens_seen": 11215016, "step": 19455 }, { "epoch": 2.8984212094131667, "grad_norm": 0.21676105260849, "learning_rate": 4.470954777924421e-05, "loss": 0.806, "num_input_tokens_seen": 11217736, "step": 19460 }, { "epoch": 2.8991659219541255, "grad_norm": 0.2605660557746887, "learning_rate": 4.4705549115389735e-05, "loss": 0.8138, "num_input_tokens_seen": 11220616, "step": 19465 }, { "epoch": 2.899910634495085, "grad_norm": 0.27148205041885376, "learning_rate": 4.470154911991435e-05, "loss": 0.7725, "num_input_tokens_seen": 11223336, "step": 19470 }, { "epoch": 2.900655347036044, "grad_norm": 0.3269940912723541, "learning_rate": 4.469754779308839e-05, "loss": 0.7565, "num_input_tokens_seen": 11226376, "step": 19475 }, { "epoch": 2.9014000595770035, "grad_norm": 0.2944265604019165, "learning_rate": 4.4693545135182235e-05, "loss": 0.8122, "num_input_tokens_seen": 11229000, "step": 19480 }, { "epoch": 2.9021447721179623, "grad_norm": 0.2875283360481262, "learning_rate": 4.468954114646637e-05, "loss": 0.7999, "num_input_tokens_seen": 11232200, "step": 19485 }, { "epoch": 2.9028894846589215, "grad_norm": 0.30107858777046204, "learning_rate": 4.468553582721135e-05, "loss": 0.7808, "num_input_tokens_seen": 11235048, "step": 19490 }, { "epoch": 2.9036341971998807, "grad_norm": 0.32146134972572327, "learning_rate": 4.4681529177687876e-05, "loss": 0.8565, "num_input_tokens_seen": 11237768, "step": 19495 }, { "epoch": 2.90437890974084, "grad_norm": 0.39141711592674255, "learning_rate": 4.467752119816667e-05, "loss": 0.8257, "num_input_tokens_seen": 11240680, "step": 19500 }, { "epoch": 2.905123622281799, "grad_norm": 0.29865914583206177, "learning_rate": 4.467351188891858e-05, "loss": 0.8093, "num_input_tokens_seen": 11243688, "step": 19505 }, { "epoch": 2.9058683348227583, "grad_norm": 0.32335856556892395, "learning_rate": 4.466950125021455e-05, "loss": 0.8313, "num_input_tokens_seen": 11246408, "step": 19510 }, { "epoch": 2.9066130473637175, "grad_norm": 0.4277401864528656, "learning_rate": 4.466548928232559e-05, "loss": 0.8205, "num_input_tokens_seen": 11249512, "step": 19515 }, { "epoch": 2.9073577599046767, "grad_norm": 0.40695473551750183, "learning_rate": 4.4661475985522825e-05, "loss": 0.833, "num_input_tokens_seen": 11252392, "step": 19520 }, { "epoch": 2.908102472445636, "grad_norm": 0.3045642673969269, "learning_rate": 4.4657461360077444e-05, "loss": 0.817, "num_input_tokens_seen": 11255240, "step": 19525 }, { "epoch": 2.908847184986595, "grad_norm": 0.30218639969825745, "learning_rate": 4.4653445406260744e-05, "loss": 0.7998, "num_input_tokens_seen": 11257928, "step": 19530 }, { "epoch": 2.9095918975275543, "grad_norm": 0.397592157125473, "learning_rate": 4.4649428124344114e-05, "loss": 0.8271, "num_input_tokens_seen": 11260744, "step": 19535 }, { "epoch": 2.9103366100685135, "grad_norm": 0.29190102219581604, "learning_rate": 4.464540951459902e-05, "loss": 0.7989, "num_input_tokens_seen": 11263560, "step": 19540 }, { "epoch": 2.9110813226094727, "grad_norm": 0.2541291415691376, "learning_rate": 4.464138957729702e-05, "loss": 0.7734, "num_input_tokens_seen": 11266120, "step": 19545 }, { "epoch": 2.911826035150432, "grad_norm": 0.2650326192378998, "learning_rate": 4.463736831270977e-05, "loss": 0.8479, "num_input_tokens_seen": 11269032, "step": 19550 }, { "epoch": 2.912570747691391, "grad_norm": 0.26093757152557373, "learning_rate": 4.463334572110901e-05, "loss": 0.7855, "num_input_tokens_seen": 11271976, "step": 19555 }, { "epoch": 2.9133154602323503, "grad_norm": 0.3322306275367737, "learning_rate": 4.462932180276657e-05, "loss": 0.8206, "num_input_tokens_seen": 11275112, "step": 19560 }, { "epoch": 2.9140601727733095, "grad_norm": 0.19066835939884186, "learning_rate": 4.462529655795437e-05, "loss": 0.7912, "num_input_tokens_seen": 11277864, "step": 19565 }, { "epoch": 2.9148048853142687, "grad_norm": 0.27242976427078247, "learning_rate": 4.462126998694442e-05, "loss": 0.8086, "num_input_tokens_seen": 11280648, "step": 19570 }, { "epoch": 2.915549597855228, "grad_norm": 0.3412463665008545, "learning_rate": 4.4617242090008816e-05, "loss": 0.764, "num_input_tokens_seen": 11283304, "step": 19575 }, { "epoch": 2.916294310396187, "grad_norm": 0.31983819603919983, "learning_rate": 4.461321286741975e-05, "loss": 0.8291, "num_input_tokens_seen": 11286120, "step": 19580 }, { "epoch": 2.9170390229371463, "grad_norm": 0.3402256667613983, "learning_rate": 4.46091823194495e-05, "loss": 0.7841, "num_input_tokens_seen": 11288904, "step": 19585 }, { "epoch": 2.9177837354781055, "grad_norm": 0.3203829228878021, "learning_rate": 4.460515044637043e-05, "loss": 0.7752, "num_input_tokens_seen": 11291688, "step": 19590 }, { "epoch": 2.9185284480190647, "grad_norm": 0.3936595618724823, "learning_rate": 4.460111724845501e-05, "loss": 0.8206, "num_input_tokens_seen": 11294888, "step": 19595 }, { "epoch": 2.919273160560024, "grad_norm": 0.4119936227798462, "learning_rate": 4.4597082725975775e-05, "loss": 0.8432, "num_input_tokens_seen": 11297704, "step": 19600 }, { "epoch": 2.920017873100983, "grad_norm": 0.28459858894348145, "learning_rate": 4.459304687920536e-05, "loss": 0.8017, "num_input_tokens_seen": 11300680, "step": 19605 }, { "epoch": 2.9207625856419424, "grad_norm": 0.3393952250480652, "learning_rate": 4.458900970841651e-05, "loss": 0.8025, "num_input_tokens_seen": 11303592, "step": 19610 }, { "epoch": 2.9215072981829016, "grad_norm": 0.28814056515693665, "learning_rate": 4.4584971213882014e-05, "loss": 0.8306, "num_input_tokens_seen": 11306216, "step": 19615 }, { "epoch": 2.9222520107238603, "grad_norm": 0.3326828181743622, "learning_rate": 4.458093139587479e-05, "loss": 0.8163, "num_input_tokens_seen": 11309064, "step": 19620 }, { "epoch": 2.92299672326482, "grad_norm": 0.31842339038848877, "learning_rate": 4.4576890254667844e-05, "loss": 0.7997, "num_input_tokens_seen": 11311848, "step": 19625 }, { "epoch": 2.9237414358057787, "grad_norm": 0.257775217294693, "learning_rate": 4.457284779053423e-05, "loss": 0.7999, "num_input_tokens_seen": 11314792, "step": 19630 }, { "epoch": 2.9244861483467384, "grad_norm": 0.3471944332122803, "learning_rate": 4.4568804003747155e-05, "loss": 0.8117, "num_input_tokens_seen": 11317416, "step": 19635 }, { "epoch": 2.925230860887697, "grad_norm": 0.21161071956157684, "learning_rate": 4.4564758894579863e-05, "loss": 0.7768, "num_input_tokens_seen": 11320232, "step": 19640 }, { "epoch": 2.9259755734286568, "grad_norm": 0.31509512662887573, "learning_rate": 4.456071246330571e-05, "loss": 0.7789, "num_input_tokens_seen": 11323304, "step": 19645 }, { "epoch": 2.9267202859696155, "grad_norm": 0.2697053849697113, "learning_rate": 4.455666471019814e-05, "loss": 0.802, "num_input_tokens_seen": 11326216, "step": 19650 }, { "epoch": 2.927464998510575, "grad_norm": 0.26006191968917847, "learning_rate": 4.455261563553067e-05, "loss": 0.8094, "num_input_tokens_seen": 11329288, "step": 19655 }, { "epoch": 2.928209711051534, "grad_norm": 0.24722406268119812, "learning_rate": 4.454856523957694e-05, "loss": 0.8345, "num_input_tokens_seen": 11332136, "step": 19660 }, { "epoch": 2.928954423592493, "grad_norm": 0.2153398096561432, "learning_rate": 4.4544513522610644e-05, "loss": 0.7555, "num_input_tokens_seen": 11334952, "step": 19665 }, { "epoch": 2.9296991361334523, "grad_norm": 0.34096232056617737, "learning_rate": 4.454046048490559e-05, "loss": 0.8023, "num_input_tokens_seen": 11337736, "step": 19670 }, { "epoch": 2.9304438486744115, "grad_norm": 0.32455796003341675, "learning_rate": 4.4536406126735664e-05, "loss": 0.8444, "num_input_tokens_seen": 11340552, "step": 19675 }, { "epoch": 2.9311885612153707, "grad_norm": 0.29003822803497314, "learning_rate": 4.4532350448374835e-05, "loss": 0.8329, "num_input_tokens_seen": 11343464, "step": 19680 }, { "epoch": 2.93193327375633, "grad_norm": 0.27927595376968384, "learning_rate": 4.452829345009718e-05, "loss": 0.845, "num_input_tokens_seen": 11346312, "step": 19685 }, { "epoch": 2.932677986297289, "grad_norm": 0.22110795974731445, "learning_rate": 4.452423513217685e-05, "loss": 0.7755, "num_input_tokens_seen": 11349192, "step": 19690 }, { "epoch": 2.9334226988382484, "grad_norm": 0.2719155251979828, "learning_rate": 4.4520175494888086e-05, "loss": 0.8071, "num_input_tokens_seen": 11352008, "step": 19695 }, { "epoch": 2.9341674113792076, "grad_norm": 0.3557601571083069, "learning_rate": 4.4516114538505225e-05, "loss": 0.8367, "num_input_tokens_seen": 11354984, "step": 19700 }, { "epoch": 2.9349121239201668, "grad_norm": 0.30558446049690247, "learning_rate": 4.45120522633027e-05, "loss": 0.7951, "num_input_tokens_seen": 11357864, "step": 19705 }, { "epoch": 2.935656836461126, "grad_norm": 0.2450423240661621, "learning_rate": 4.4507988669555e-05, "loss": 0.8182, "num_input_tokens_seen": 11360552, "step": 19710 }, { "epoch": 2.936401549002085, "grad_norm": 0.3059905171394348, "learning_rate": 4.450392375753675e-05, "loss": 0.8196, "num_input_tokens_seen": 11363208, "step": 19715 }, { "epoch": 2.9371462615430444, "grad_norm": 0.2201198935508728, "learning_rate": 4.449985752752261e-05, "loss": 0.7647, "num_input_tokens_seen": 11366152, "step": 19720 }, { "epoch": 2.9378909740840036, "grad_norm": 0.3143206834793091, "learning_rate": 4.44957899797874e-05, "loss": 0.8265, "num_input_tokens_seen": 11369000, "step": 19725 }, { "epoch": 2.9386356866249628, "grad_norm": 0.27857309579849243, "learning_rate": 4.449172111460597e-05, "loss": 0.81, "num_input_tokens_seen": 11371784, "step": 19730 }, { "epoch": 2.939380399165922, "grad_norm": 0.2838647663593292, "learning_rate": 4.448765093225326e-05, "loss": 0.8293, "num_input_tokens_seen": 11374856, "step": 19735 }, { "epoch": 2.940125111706881, "grad_norm": 0.28142017126083374, "learning_rate": 4.448357943300434e-05, "loss": 0.783, "num_input_tokens_seen": 11377800, "step": 19740 }, { "epoch": 2.9408698242478404, "grad_norm": 0.3084962069988251, "learning_rate": 4.4479506617134324e-05, "loss": 0.8263, "num_input_tokens_seen": 11380360, "step": 19745 }, { "epoch": 2.9416145367887996, "grad_norm": 0.38856858015060425, "learning_rate": 4.447543248491846e-05, "loss": 0.8369, "num_input_tokens_seen": 11383624, "step": 19750 }, { "epoch": 2.942359249329759, "grad_norm": 0.32892248034477234, "learning_rate": 4.447135703663205e-05, "loss": 0.7932, "num_input_tokens_seen": 11386568, "step": 19755 }, { "epoch": 2.943103961870718, "grad_norm": 0.36291617155075073, "learning_rate": 4.4467280272550495e-05, "loss": 0.859, "num_input_tokens_seen": 11389608, "step": 19760 }, { "epoch": 2.943848674411677, "grad_norm": 0.3122326135635376, "learning_rate": 4.4463202192949284e-05, "loss": 0.8259, "num_input_tokens_seen": 11393768, "step": 19765 }, { "epoch": 2.9445933869526364, "grad_norm": 0.3244507908821106, "learning_rate": 4.4459122798104004e-05, "loss": 0.8001, "num_input_tokens_seen": 11396840, "step": 19770 }, { "epoch": 2.9453380994935956, "grad_norm": 0.31402361392974854, "learning_rate": 4.445504208829032e-05, "loss": 0.7917, "num_input_tokens_seen": 11399848, "step": 19775 }, { "epoch": 2.946082812034555, "grad_norm": 0.30729684233665466, "learning_rate": 4.445096006378399e-05, "loss": 0.829, "num_input_tokens_seen": 11403016, "step": 19780 }, { "epoch": 2.946827524575514, "grad_norm": 0.2988283038139343, "learning_rate": 4.4446876724860856e-05, "loss": 0.7748, "num_input_tokens_seen": 11405960, "step": 19785 }, { "epoch": 2.947572237116473, "grad_norm": 0.25100693106651306, "learning_rate": 4.444279207179687e-05, "loss": 0.7794, "num_input_tokens_seen": 11408776, "step": 19790 }, { "epoch": 2.948316949657432, "grad_norm": 0.33261850476264954, "learning_rate": 4.443870610486803e-05, "loss": 0.7983, "num_input_tokens_seen": 11411784, "step": 19795 }, { "epoch": 2.9490616621983916, "grad_norm": 0.22322657704353333, "learning_rate": 4.4434618824350475e-05, "loss": 0.7915, "num_input_tokens_seen": 11414248, "step": 19800 }, { "epoch": 2.9498063747393504, "grad_norm": 0.23006707429885864, "learning_rate": 4.4430530230520386e-05, "loss": 0.7978, "num_input_tokens_seen": 11417032, "step": 19805 }, { "epoch": 2.95055108728031, "grad_norm": 0.2520967721939087, "learning_rate": 4.442644032365407e-05, "loss": 0.7713, "num_input_tokens_seen": 11419784, "step": 19810 }, { "epoch": 2.9512957998212688, "grad_norm": 0.330416202545166, "learning_rate": 4.4422349104027895e-05, "loss": 0.7747, "num_input_tokens_seen": 11422792, "step": 19815 }, { "epoch": 2.9520405123622284, "grad_norm": 0.32140931487083435, "learning_rate": 4.4418256571918334e-05, "loss": 0.8539, "num_input_tokens_seen": 11425736, "step": 19820 }, { "epoch": 2.952785224903187, "grad_norm": 0.26388803124427795, "learning_rate": 4.441416272760194e-05, "loss": 0.7967, "num_input_tokens_seen": 11428360, "step": 19825 }, { "epoch": 2.953529937444147, "grad_norm": 0.23982499539852142, "learning_rate": 4.441006757135536e-05, "loss": 0.8026, "num_input_tokens_seen": 11431176, "step": 19830 }, { "epoch": 2.9542746499851056, "grad_norm": 0.25529804825782776, "learning_rate": 4.440597110345533e-05, "loss": 0.8063, "num_input_tokens_seen": 11434184, "step": 19835 }, { "epoch": 2.955019362526065, "grad_norm": 0.2783116400241852, "learning_rate": 4.4401873324178684e-05, "loss": 0.7854, "num_input_tokens_seen": 11437224, "step": 19840 }, { "epoch": 2.955764075067024, "grad_norm": 0.3213939368724823, "learning_rate": 4.439777423380231e-05, "loss": 0.8218, "num_input_tokens_seen": 11439816, "step": 19845 }, { "epoch": 2.956508787607983, "grad_norm": 0.27095308899879456, "learning_rate": 4.439367383260322e-05, "loss": 0.7813, "num_input_tokens_seen": 11442536, "step": 19850 }, { "epoch": 2.9572535001489424, "grad_norm": 0.38060081005096436, "learning_rate": 4.4389572120858506e-05, "loss": 0.7787, "num_input_tokens_seen": 11445800, "step": 19855 }, { "epoch": 2.9579982126899016, "grad_norm": 0.22235272824764252, "learning_rate": 4.4385469098845335e-05, "loss": 0.7971, "num_input_tokens_seen": 11448424, "step": 19860 }, { "epoch": 2.958742925230861, "grad_norm": 0.35102978348731995, "learning_rate": 4.438136476684098e-05, "loss": 0.8305, "num_input_tokens_seen": 11451240, "step": 19865 }, { "epoch": 2.95948763777182, "grad_norm": 0.2592136859893799, "learning_rate": 4.4377259125122786e-05, "loss": 0.8243, "num_input_tokens_seen": 11454184, "step": 19870 }, { "epoch": 2.960232350312779, "grad_norm": 0.2955082058906555, "learning_rate": 4.4373152173968214e-05, "loss": 0.8077, "num_input_tokens_seen": 11456712, "step": 19875 }, { "epoch": 2.9609770628537384, "grad_norm": 0.23093551397323608, "learning_rate": 4.436904391365477e-05, "loss": 0.797, "num_input_tokens_seen": 11459656, "step": 19880 }, { "epoch": 2.9617217753946976, "grad_norm": 0.26485922932624817, "learning_rate": 4.43649343444601e-05, "loss": 0.7834, "num_input_tokens_seen": 11462440, "step": 19885 }, { "epoch": 2.962466487935657, "grad_norm": 0.2404690980911255, "learning_rate": 4.436082346666189e-05, "loss": 0.7808, "num_input_tokens_seen": 11465288, "step": 19890 }, { "epoch": 2.963211200476616, "grad_norm": 0.4577438533306122, "learning_rate": 4.4356711280537954e-05, "loss": 0.8451, "num_input_tokens_seen": 11468392, "step": 19895 }, { "epoch": 2.963955913017575, "grad_norm": 0.3346003293991089, "learning_rate": 4.435259778636617e-05, "loss": 0.8174, "num_input_tokens_seen": 11471272, "step": 19900 }, { "epoch": 2.9647006255585344, "grad_norm": 0.21846090257167816, "learning_rate": 4.43484829844245e-05, "loss": 0.8333, "num_input_tokens_seen": 11474216, "step": 19905 }, { "epoch": 2.9654453380994936, "grad_norm": 0.24203184247016907, "learning_rate": 4.434436687499102e-05, "loss": 0.8164, "num_input_tokens_seen": 11476776, "step": 19910 }, { "epoch": 2.966190050640453, "grad_norm": 0.27383163571357727, "learning_rate": 4.434024945834387e-05, "loss": 0.8177, "num_input_tokens_seen": 11479592, "step": 19915 }, { "epoch": 2.966934763181412, "grad_norm": 0.32537323236465454, "learning_rate": 4.43361307347613e-05, "loss": 0.797, "num_input_tokens_seen": 11482632, "step": 19920 }, { "epoch": 2.9676794757223712, "grad_norm": 0.32070866227149963, "learning_rate": 4.433201070452163e-05, "loss": 0.7768, "num_input_tokens_seen": 11485736, "step": 19925 }, { "epoch": 2.9684241882633304, "grad_norm": 0.3071169853210449, "learning_rate": 4.432788936790327e-05, "loss": 0.8192, "num_input_tokens_seen": 11488456, "step": 19930 }, { "epoch": 2.9691689008042896, "grad_norm": 0.28016266226768494, "learning_rate": 4.432376672518473e-05, "loss": 0.8171, "num_input_tokens_seen": 11491528, "step": 19935 }, { "epoch": 2.969913613345249, "grad_norm": 0.2529054582118988, "learning_rate": 4.43196427766446e-05, "loss": 0.7837, "num_input_tokens_seen": 11494408, "step": 19940 }, { "epoch": 2.970658325886208, "grad_norm": 0.2800288796424866, "learning_rate": 4.431551752256155e-05, "loss": 0.8021, "num_input_tokens_seen": 11497128, "step": 19945 }, { "epoch": 2.9714030384271672, "grad_norm": 0.30075177550315857, "learning_rate": 4.4311390963214375e-05, "loss": 0.8274, "num_input_tokens_seen": 11499912, "step": 19950 }, { "epoch": 2.9721477509681264, "grad_norm": 0.24156558513641357, "learning_rate": 4.43072630988819e-05, "loss": 0.7951, "num_input_tokens_seen": 11502696, "step": 19955 }, { "epoch": 2.9728924635090856, "grad_norm": 0.24841217696666718, "learning_rate": 4.4303133929843086e-05, "loss": 0.808, "num_input_tokens_seen": 11505512, "step": 19960 }, { "epoch": 2.973637176050045, "grad_norm": 0.25978222489356995, "learning_rate": 4.4299003456376966e-05, "loss": 0.8037, "num_input_tokens_seen": 11508616, "step": 19965 }, { "epoch": 2.9743818885910036, "grad_norm": 0.2673286497592926, "learning_rate": 4.429487167876265e-05, "loss": 0.7943, "num_input_tokens_seen": 11511368, "step": 19970 }, { "epoch": 2.9751266011319633, "grad_norm": 0.2416476458311081, "learning_rate": 4.429073859727936e-05, "loss": 0.7759, "num_input_tokens_seen": 11514504, "step": 19975 }, { "epoch": 2.975871313672922, "grad_norm": 0.2682877779006958, "learning_rate": 4.428660421220638e-05, "loss": 0.8083, "num_input_tokens_seen": 11517320, "step": 19980 }, { "epoch": 2.9766160262138817, "grad_norm": 0.27737540006637573, "learning_rate": 4.42824685238231e-05, "loss": 0.8178, "num_input_tokens_seen": 11520296, "step": 19985 }, { "epoch": 2.9773607387548404, "grad_norm": 0.33325788378715515, "learning_rate": 4.4278331532409e-05, "loss": 0.8103, "num_input_tokens_seen": 11523208, "step": 19990 }, { "epoch": 2.9781054512958, "grad_norm": 0.3844397962093353, "learning_rate": 4.427419323824363e-05, "loss": 0.8139, "num_input_tokens_seen": 11526280, "step": 19995 }, { "epoch": 2.978850163836759, "grad_norm": 0.4069511294364929, "learning_rate": 4.427005364160665e-05, "loss": 0.8164, "num_input_tokens_seen": 11528840, "step": 20000 }, { "epoch": 2.9795948763777185, "grad_norm": 0.2685054540634155, "learning_rate": 4.426591274277778e-05, "loss": 0.7875, "num_input_tokens_seen": 11531624, "step": 20005 }, { "epoch": 2.9803395889186772, "grad_norm": 0.31653037667274475, "learning_rate": 4.426177054203686e-05, "loss": 0.8238, "num_input_tokens_seen": 11534504, "step": 20010 }, { "epoch": 2.9810843014596364, "grad_norm": 0.37335795164108276, "learning_rate": 4.425762703966381e-05, "loss": 0.8166, "num_input_tokens_seen": 11537448, "step": 20015 }, { "epoch": 2.9818290140005956, "grad_norm": 0.25169792771339417, "learning_rate": 4.425348223593861e-05, "loss": 0.8053, "num_input_tokens_seen": 11540360, "step": 20020 }, { "epoch": 2.982573726541555, "grad_norm": 0.22289524972438812, "learning_rate": 4.424933613114136e-05, "loss": 0.8138, "num_input_tokens_seen": 11543304, "step": 20025 }, { "epoch": 2.983318439082514, "grad_norm": 0.3636139929294586, "learning_rate": 4.424518872555224e-05, "loss": 0.818, "num_input_tokens_seen": 11546376, "step": 20030 }, { "epoch": 2.9840631516234732, "grad_norm": 0.2537362277507782, "learning_rate": 4.424104001945151e-05, "loss": 0.7877, "num_input_tokens_seen": 11549224, "step": 20035 }, { "epoch": 2.9848078641644324, "grad_norm": 0.22033096849918365, "learning_rate": 4.4236890013119527e-05, "loss": 0.8137, "num_input_tokens_seen": 11552008, "step": 20040 }, { "epoch": 2.9855525767053916, "grad_norm": 0.18909414112567902, "learning_rate": 4.423273870683672e-05, "loss": 0.7946, "num_input_tokens_seen": 11554792, "step": 20045 }, { "epoch": 2.986297289246351, "grad_norm": 0.22720979154109955, "learning_rate": 4.422858610088364e-05, "loss": 0.8016, "num_input_tokens_seen": 11557736, "step": 20050 }, { "epoch": 2.98704200178731, "grad_norm": 0.26330605149269104, "learning_rate": 4.422443219554088e-05, "loss": 0.8096, "num_input_tokens_seen": 11560392, "step": 20055 }, { "epoch": 2.9877867143282693, "grad_norm": 0.4344521164894104, "learning_rate": 4.422027699108915e-05, "loss": 0.8143, "num_input_tokens_seen": 11563720, "step": 20060 }, { "epoch": 2.9885314268692285, "grad_norm": 0.265003502368927, "learning_rate": 4.421612048780925e-05, "loss": 0.8054, "num_input_tokens_seen": 11566696, "step": 20065 }, { "epoch": 2.9892761394101877, "grad_norm": 0.29474076628685, "learning_rate": 4.421196268598205e-05, "loss": 0.7914, "num_input_tokens_seen": 11569448, "step": 20070 }, { "epoch": 2.990020851951147, "grad_norm": 0.2615264356136322, "learning_rate": 4.4207803585888524e-05, "loss": 0.7822, "num_input_tokens_seen": 11572328, "step": 20075 }, { "epoch": 2.990765564492106, "grad_norm": 0.27894771099090576, "learning_rate": 4.420364318780973e-05, "loss": 0.79, "num_input_tokens_seen": 11575304, "step": 20080 }, { "epoch": 2.9915102770330653, "grad_norm": 0.31638655066490173, "learning_rate": 4.419948149202679e-05, "loss": 0.8213, "num_input_tokens_seen": 11577960, "step": 20085 }, { "epoch": 2.9922549895740245, "grad_norm": 0.2157943993806839, "learning_rate": 4.419531849882097e-05, "loss": 0.7852, "num_input_tokens_seen": 11580680, "step": 20090 }, { "epoch": 2.9929997021149837, "grad_norm": 0.26032736897468567, "learning_rate": 4.419115420847356e-05, "loss": 0.8107, "num_input_tokens_seen": 11583464, "step": 20095 }, { "epoch": 2.993744414655943, "grad_norm": 0.30422061681747437, "learning_rate": 4.418698862126597e-05, "loss": 0.7565, "num_input_tokens_seen": 11586376, "step": 20100 }, { "epoch": 2.994489127196902, "grad_norm": 0.23141948878765106, "learning_rate": 4.418282173747971e-05, "loss": 0.8145, "num_input_tokens_seen": 11589480, "step": 20105 }, { "epoch": 2.9952338397378613, "grad_norm": 0.5104389786720276, "learning_rate": 4.4178653557396335e-05, "loss": 0.821, "num_input_tokens_seen": 11592840, "step": 20110 }, { "epoch": 2.9959785522788205, "grad_norm": 0.24096645414829254, "learning_rate": 4.417448408129753e-05, "loss": 0.8168, "num_input_tokens_seen": 11595688, "step": 20115 }, { "epoch": 2.9967232648197797, "grad_norm": 0.28859227895736694, "learning_rate": 4.417031330946505e-05, "loss": 0.8225, "num_input_tokens_seen": 11598920, "step": 20120 }, { "epoch": 2.997467977360739, "grad_norm": 0.30888548493385315, "learning_rate": 4.4166141242180736e-05, "loss": 0.812, "num_input_tokens_seen": 11601672, "step": 20125 }, { "epoch": 2.998212689901698, "grad_norm": 0.24181707203388214, "learning_rate": 4.4161967879726526e-05, "loss": 0.813, "num_input_tokens_seen": 11604680, "step": 20130 }, { "epoch": 2.9989574024426573, "grad_norm": 0.3624911904335022, "learning_rate": 4.415779322238443e-05, "loss": 0.7996, "num_input_tokens_seen": 11607688, "step": 20135 }, { "epoch": 2.9997021149836165, "grad_norm": 0.26177090406417847, "learning_rate": 4.4153617270436556e-05, "loss": 0.8009, "num_input_tokens_seen": 11610440, "step": 20140 }, { "epoch": 3.0, "eval_loss": 0.8064948320388794, "eval_runtime": 45.4281, "eval_samples_per_second": 65.686, "eval_steps_per_second": 16.422, "num_input_tokens_seen": 11611120, "step": 20142 }, { "epoch": 3.0004468275245757, "grad_norm": 0.37299102544784546, "learning_rate": 4.414944002416511e-05, "loss": 0.7934, "num_input_tokens_seen": 11612848, "step": 20145 }, { "epoch": 3.001191540065535, "grad_norm": 0.2938947379589081, "learning_rate": 4.414526148385235e-05, "loss": 0.7879, "num_input_tokens_seen": 11615664, "step": 20150 }, { "epoch": 3.001936252606494, "grad_norm": 0.24346882104873657, "learning_rate": 4.414108164978067e-05, "loss": 0.7945, "num_input_tokens_seen": 11618480, "step": 20155 }, { "epoch": 3.002680965147453, "grad_norm": 0.3412632644176483, "learning_rate": 4.4136900522232506e-05, "loss": 0.8192, "num_input_tokens_seen": 11621168, "step": 20160 }, { "epoch": 3.003425677688412, "grad_norm": 0.22034546732902527, "learning_rate": 4.413271810149041e-05, "loss": 0.8169, "num_input_tokens_seen": 11624240, "step": 20165 }, { "epoch": 3.0041703902293713, "grad_norm": 0.27503716945648193, "learning_rate": 4.412853438783701e-05, "loss": 0.7808, "num_input_tokens_seen": 11627312, "step": 20170 }, { "epoch": 3.0049151027703305, "grad_norm": 0.22994175553321838, "learning_rate": 4.412434938155503e-05, "loss": 0.8021, "num_input_tokens_seen": 11630256, "step": 20175 }, { "epoch": 3.0056598153112897, "grad_norm": 0.25569695234298706, "learning_rate": 4.4120163082927274e-05, "loss": 0.8007, "num_input_tokens_seen": 11632880, "step": 20180 }, { "epoch": 3.006404527852249, "grad_norm": 0.22010023891925812, "learning_rate": 4.411597549223663e-05, "loss": 0.8036, "num_input_tokens_seen": 11635696, "step": 20185 }, { "epoch": 3.007149240393208, "grad_norm": 0.24192191660404205, "learning_rate": 4.411178660976609e-05, "loss": 0.8188, "num_input_tokens_seen": 11638576, "step": 20190 }, { "epoch": 3.0078939529341673, "grad_norm": 0.3655867576599121, "learning_rate": 4.410759643579871e-05, "loss": 0.8172, "num_input_tokens_seen": 11641840, "step": 20195 }, { "epoch": 3.0086386654751265, "grad_norm": 0.45838266611099243, "learning_rate": 4.410340497061764e-05, "loss": 0.8209, "num_input_tokens_seen": 11644432, "step": 20200 }, { "epoch": 3.0093833780160857, "grad_norm": 0.25374147295951843, "learning_rate": 4.4099212214506146e-05, "loss": 0.8247, "num_input_tokens_seen": 11647248, "step": 20205 }, { "epoch": 3.010128090557045, "grad_norm": 0.23431451618671417, "learning_rate": 4.4095018167747536e-05, "loss": 0.797, "num_input_tokens_seen": 11650256, "step": 20210 }, { "epoch": 3.010872803098004, "grad_norm": 0.3103838860988617, "learning_rate": 4.4090822830625236e-05, "loss": 0.7715, "num_input_tokens_seen": 11653104, "step": 20215 }, { "epoch": 3.0116175156389633, "grad_norm": 0.27086305618286133, "learning_rate": 4.408662620342274e-05, "loss": 0.7926, "num_input_tokens_seen": 11655952, "step": 20220 }, { "epoch": 3.0123622281799225, "grad_norm": 0.3528328835964203, "learning_rate": 4.408242828642365e-05, "loss": 0.8014, "num_input_tokens_seen": 11658960, "step": 20225 }, { "epoch": 3.0131069407208817, "grad_norm": 0.33923548460006714, "learning_rate": 4.4078229079911636e-05, "loss": 0.8265, "num_input_tokens_seen": 11661840, "step": 20230 }, { "epoch": 3.013851653261841, "grad_norm": 0.26749706268310547, "learning_rate": 4.407402858417047e-05, "loss": 0.8277, "num_input_tokens_seen": 11664848, "step": 20235 }, { "epoch": 3.0145963658028, "grad_norm": 0.23655933141708374, "learning_rate": 4.4069826799484e-05, "loss": 0.821, "num_input_tokens_seen": 11667632, "step": 20240 }, { "epoch": 3.0153410783437593, "grad_norm": 0.35847043991088867, "learning_rate": 4.406562372613617e-05, "loss": 0.794, "num_input_tokens_seen": 11670544, "step": 20245 }, { "epoch": 3.0160857908847185, "grad_norm": 0.2385745793581009, "learning_rate": 4.406141936441099e-05, "loss": 0.8106, "num_input_tokens_seen": 11673392, "step": 20250 }, { "epoch": 3.0168305034256777, "grad_norm": 0.3003644645214081, "learning_rate": 4.40572137145926e-05, "loss": 0.7888, "num_input_tokens_seen": 11676464, "step": 20255 }, { "epoch": 3.017575215966637, "grad_norm": 0.21484322845935822, "learning_rate": 4.405300677696519e-05, "loss": 0.8143, "num_input_tokens_seen": 11679248, "step": 20260 }, { "epoch": 3.018319928507596, "grad_norm": 0.31254902482032776, "learning_rate": 4.4048798551813056e-05, "loss": 0.8381, "num_input_tokens_seen": 11682032, "step": 20265 }, { "epoch": 3.0190646410485553, "grad_norm": 0.24132144451141357, "learning_rate": 4.4044589039420546e-05, "loss": 0.7721, "num_input_tokens_seen": 11684752, "step": 20270 }, { "epoch": 3.0198093535895145, "grad_norm": 0.3036220669746399, "learning_rate": 4.404037824007214e-05, "loss": 0.7943, "num_input_tokens_seen": 11687536, "step": 20275 }, { "epoch": 3.0205540661304737, "grad_norm": 0.24373604357242584, "learning_rate": 4.4036166154052387e-05, "loss": 0.8116, "num_input_tokens_seen": 11690736, "step": 20280 }, { "epoch": 3.021298778671433, "grad_norm": 0.334964394569397, "learning_rate": 4.4031952781645924e-05, "loss": 0.8142, "num_input_tokens_seen": 11693904, "step": 20285 }, { "epoch": 3.022043491212392, "grad_norm": 0.25498291850090027, "learning_rate": 4.4027738123137465e-05, "loss": 0.7881, "num_input_tokens_seen": 11696912, "step": 20290 }, { "epoch": 3.0227882037533513, "grad_norm": 0.25651490688323975, "learning_rate": 4.402352217881183e-05, "loss": 0.8086, "num_input_tokens_seen": 11699920, "step": 20295 }, { "epoch": 3.0235329162943105, "grad_norm": 0.24753305315971375, "learning_rate": 4.4019304948953906e-05, "loss": 0.804, "num_input_tokens_seen": 11702960, "step": 20300 }, { "epoch": 3.0242776288352697, "grad_norm": 0.28693631291389465, "learning_rate": 4.401508643384868e-05, "loss": 0.8299, "num_input_tokens_seen": 11706000, "step": 20305 }, { "epoch": 3.025022341376229, "grad_norm": 0.377998411655426, "learning_rate": 4.4010866633781225e-05, "loss": 0.7905, "num_input_tokens_seen": 11708880, "step": 20310 }, { "epoch": 3.025767053917188, "grad_norm": 0.3191511332988739, "learning_rate": 4.4006645549036697e-05, "loss": 0.8073, "num_input_tokens_seen": 11712560, "step": 20315 }, { "epoch": 3.0265117664581473, "grad_norm": 0.21377511322498322, "learning_rate": 4.400242317990033e-05, "loss": 0.7974, "num_input_tokens_seen": 11715856, "step": 20320 }, { "epoch": 3.0272564789991065, "grad_norm": 0.29284897446632385, "learning_rate": 4.399819952665747e-05, "loss": 0.7913, "num_input_tokens_seen": 11718832, "step": 20325 }, { "epoch": 3.0280011915400658, "grad_norm": 0.2588324248790741, "learning_rate": 4.399397458959353e-05, "loss": 0.8254, "num_input_tokens_seen": 11721456, "step": 20330 }, { "epoch": 3.0287459040810245, "grad_norm": 0.26923444867134094, "learning_rate": 4.398974836899401e-05, "loss": 0.8019, "num_input_tokens_seen": 11724144, "step": 20335 }, { "epoch": 3.0294906166219837, "grad_norm": 0.29855969548225403, "learning_rate": 4.398552086514449e-05, "loss": 0.7918, "num_input_tokens_seen": 11726928, "step": 20340 }, { "epoch": 3.030235329162943, "grad_norm": 0.2493988573551178, "learning_rate": 4.398129207833067e-05, "loss": 0.7964, "num_input_tokens_seen": 11729648, "step": 20345 }, { "epoch": 3.030980041703902, "grad_norm": 0.26848480105400085, "learning_rate": 4.3977062008838307e-05, "loss": 0.8156, "num_input_tokens_seen": 11732336, "step": 20350 }, { "epoch": 3.0317247542448613, "grad_norm": 0.277130663394928, "learning_rate": 4.397283065695325e-05, "loss": 0.7733, "num_input_tokens_seen": 11735280, "step": 20355 }, { "epoch": 3.0324694667858205, "grad_norm": 0.3369447588920593, "learning_rate": 4.396859802296142e-05, "loss": 0.8116, "num_input_tokens_seen": 11738192, "step": 20360 }, { "epoch": 3.0332141793267797, "grad_norm": 0.27125632762908936, "learning_rate": 4.396436410714887e-05, "loss": 0.7842, "num_input_tokens_seen": 11741232, "step": 20365 }, { "epoch": 3.033958891867739, "grad_norm": 0.40566369891166687, "learning_rate": 4.396012890980169e-05, "loss": 0.8045, "num_input_tokens_seen": 11744592, "step": 20370 }, { "epoch": 3.034703604408698, "grad_norm": 0.24872224032878876, "learning_rate": 4.3955892431206085e-05, "loss": 0.7864, "num_input_tokens_seen": 11747344, "step": 20375 }, { "epoch": 3.0354483169496573, "grad_norm": 0.39916789531707764, "learning_rate": 4.395165467164834e-05, "loss": 0.7882, "num_input_tokens_seen": 11750064, "step": 20380 }, { "epoch": 3.0361930294906165, "grad_norm": 0.24876856803894043, "learning_rate": 4.394741563141482e-05, "loss": 0.7852, "num_input_tokens_seen": 11753104, "step": 20385 }, { "epoch": 3.0369377420315757, "grad_norm": 0.23288846015930176, "learning_rate": 4.3943175310791995e-05, "loss": 0.78, "num_input_tokens_seen": 11755792, "step": 20390 }, { "epoch": 3.037682454572535, "grad_norm": 0.42947280406951904, "learning_rate": 4.3938933710066396e-05, "loss": 0.7842, "num_input_tokens_seen": 11758768, "step": 20395 }, { "epoch": 3.038427167113494, "grad_norm": 0.27844369411468506, "learning_rate": 4.393469082952466e-05, "loss": 0.7964, "num_input_tokens_seen": 11761648, "step": 20400 }, { "epoch": 3.0391718796544533, "grad_norm": 0.2321588695049286, "learning_rate": 4.3930446669453494e-05, "loss": 0.8003, "num_input_tokens_seen": 11764656, "step": 20405 }, { "epoch": 3.0399165921954125, "grad_norm": 0.25328850746154785, "learning_rate": 4.392620123013971e-05, "loss": 0.8272, "num_input_tokens_seen": 11767440, "step": 20410 }, { "epoch": 3.0406613047363718, "grad_norm": 0.27782386541366577, "learning_rate": 4.3921954511870194e-05, "loss": 0.8032, "num_input_tokens_seen": 11770704, "step": 20415 }, { "epoch": 3.041406017277331, "grad_norm": 0.42006343603134155, "learning_rate": 4.3917706514931926e-05, "loss": 0.8549, "num_input_tokens_seen": 11773552, "step": 20420 }, { "epoch": 3.04215072981829, "grad_norm": 0.25786417722702026, "learning_rate": 4.391345723961197e-05, "loss": 0.81, "num_input_tokens_seen": 11776400, "step": 20425 }, { "epoch": 3.0428954423592494, "grad_norm": 0.24364995956420898, "learning_rate": 4.3909206686197456e-05, "loss": 0.7922, "num_input_tokens_seen": 11779280, "step": 20430 }, { "epoch": 3.0436401549002086, "grad_norm": 0.32118168473243713, "learning_rate": 4.3904954854975644e-05, "loss": 0.8057, "num_input_tokens_seen": 11782416, "step": 20435 }, { "epoch": 3.0443848674411678, "grad_norm": 0.23282282054424286, "learning_rate": 4.390070174623384e-05, "loss": 0.834, "num_input_tokens_seen": 11785296, "step": 20440 }, { "epoch": 3.045129579982127, "grad_norm": 0.2550814151763916, "learning_rate": 4.389644736025946e-05, "loss": 0.79, "num_input_tokens_seen": 11788016, "step": 20445 }, { "epoch": 3.045874292523086, "grad_norm": 0.19949358701705933, "learning_rate": 4.389219169734e-05, "loss": 0.8091, "num_input_tokens_seen": 11790832, "step": 20450 }, { "epoch": 3.0466190050640454, "grad_norm": 0.33977556228637695, "learning_rate": 4.388793475776303e-05, "loss": 0.7979, "num_input_tokens_seen": 11793776, "step": 20455 }, { "epoch": 3.0473637176050046, "grad_norm": 0.33376947045326233, "learning_rate": 4.388367654181622e-05, "loss": 0.8046, "num_input_tokens_seen": 11796720, "step": 20460 }, { "epoch": 3.0481084301459638, "grad_norm": 0.320339560508728, "learning_rate": 4.387941704978733e-05, "loss": 0.774, "num_input_tokens_seen": 11800528, "step": 20465 }, { "epoch": 3.048853142686923, "grad_norm": 0.3236607611179352, "learning_rate": 4.3875156281964186e-05, "loss": 0.829, "num_input_tokens_seen": 11803184, "step": 20470 }, { "epoch": 3.049597855227882, "grad_norm": 0.3162200450897217, "learning_rate": 4.3870894238634725e-05, "loss": 0.8423, "num_input_tokens_seen": 11805680, "step": 20475 }, { "epoch": 3.0503425677688414, "grad_norm": 0.2603805661201477, "learning_rate": 4.386663092008696e-05, "loss": 0.8285, "num_input_tokens_seen": 11808400, "step": 20480 }, { "epoch": 3.0510872803098006, "grad_norm": 0.40922075510025024, "learning_rate": 4.3862366326608975e-05, "loss": 0.7906, "num_input_tokens_seen": 11811376, "step": 20485 }, { "epoch": 3.05183199285076, "grad_norm": 0.2295440286397934, "learning_rate": 4.385810045848896e-05, "loss": 0.7639, "num_input_tokens_seen": 11814096, "step": 20490 }, { "epoch": 3.052576705391719, "grad_norm": 0.3264563977718353, "learning_rate": 4.38538333160152e-05, "loss": 0.7743, "num_input_tokens_seen": 11816848, "step": 20495 }, { "epoch": 3.053321417932678, "grad_norm": 0.3456261456012726, "learning_rate": 4.3849564899476026e-05, "loss": 0.8062, "num_input_tokens_seen": 11819568, "step": 20500 }, { "epoch": 3.054066130473637, "grad_norm": 0.26652592420578003, "learning_rate": 4.38452952091599e-05, "loss": 0.8092, "num_input_tokens_seen": 11822768, "step": 20505 }, { "epoch": 3.054810843014596, "grad_norm": 0.3586704432964325, "learning_rate": 4.3841024245355346e-05, "loss": 0.8147, "num_input_tokens_seen": 11825808, "step": 20510 }, { "epoch": 3.0555555555555554, "grad_norm": 0.293451189994812, "learning_rate": 4.383675200835097e-05, "loss": 0.7839, "num_input_tokens_seen": 11828720, "step": 20515 }, { "epoch": 3.0563002680965146, "grad_norm": 0.35792505741119385, "learning_rate": 4.383247849843548e-05, "loss": 0.7958, "num_input_tokens_seen": 11831568, "step": 20520 }, { "epoch": 3.0570449806374738, "grad_norm": 0.3176777958869934, "learning_rate": 4.382820371589766e-05, "loss": 0.807, "num_input_tokens_seen": 11834192, "step": 20525 }, { "epoch": 3.057789693178433, "grad_norm": 0.28406262397766113, "learning_rate": 4.382392766102638e-05, "loss": 0.7526, "num_input_tokens_seen": 11837040, "step": 20530 }, { "epoch": 3.058534405719392, "grad_norm": 0.26700305938720703, "learning_rate": 4.381965033411061e-05, "loss": 0.7985, "num_input_tokens_seen": 11839888, "step": 20535 }, { "epoch": 3.0592791182603514, "grad_norm": 0.22757571935653687, "learning_rate": 4.381537173543937e-05, "loss": 0.7467, "num_input_tokens_seen": 11843024, "step": 20540 }, { "epoch": 3.0600238308013106, "grad_norm": 0.3043833076953888, "learning_rate": 4.381109186530182e-05, "loss": 0.7328, "num_input_tokens_seen": 11845744, "step": 20545 }, { "epoch": 3.0607685433422698, "grad_norm": 0.21605098247528076, "learning_rate": 4.380681072398716e-05, "loss": 0.7941, "num_input_tokens_seen": 11848432, "step": 20550 }, { "epoch": 3.061513255883229, "grad_norm": 0.2610557973384857, "learning_rate": 4.3802528311784686e-05, "loss": 0.8073, "num_input_tokens_seen": 11851728, "step": 20555 }, { "epoch": 3.062257968424188, "grad_norm": 0.3823758065700531, "learning_rate": 4.37982446289838e-05, "loss": 0.7954, "num_input_tokens_seen": 11854544, "step": 20560 }, { "epoch": 3.0630026809651474, "grad_norm": 0.28575268387794495, "learning_rate": 4.379395967587398e-05, "loss": 0.7951, "num_input_tokens_seen": 11857840, "step": 20565 }, { "epoch": 3.0637473935061066, "grad_norm": 0.2950553596019745, "learning_rate": 4.378967345274476e-05, "loss": 0.8233, "num_input_tokens_seen": 11860784, "step": 20570 }, { "epoch": 3.064492106047066, "grad_norm": 0.2747868001461029, "learning_rate": 4.3785385959885805e-05, "loss": 0.7947, "num_input_tokens_seen": 11863824, "step": 20575 }, { "epoch": 3.065236818588025, "grad_norm": 0.24721956253051758, "learning_rate": 4.3781097197586845e-05, "loss": 0.8254, "num_input_tokens_seen": 11866352, "step": 20580 }, { "epoch": 3.065981531128984, "grad_norm": 0.3724307119846344, "learning_rate": 4.377680716613769e-05, "loss": 0.8604, "num_input_tokens_seen": 11869200, "step": 20585 }, { "epoch": 3.0667262436699434, "grad_norm": 0.2689606845378876, "learning_rate": 4.377251586582826e-05, "loss": 0.7818, "num_input_tokens_seen": 11871984, "step": 20590 }, { "epoch": 3.0674709562109026, "grad_norm": 0.3007524311542511, "learning_rate": 4.3768223296948516e-05, "loss": 0.8041, "num_input_tokens_seen": 11874928, "step": 20595 }, { "epoch": 3.068215668751862, "grad_norm": 0.32661956548690796, "learning_rate": 4.3763929459788554e-05, "loss": 0.8621, "num_input_tokens_seen": 11877680, "step": 20600 }, { "epoch": 3.068960381292821, "grad_norm": 0.27191269397735596, "learning_rate": 4.375963435463853e-05, "loss": 0.8376, "num_input_tokens_seen": 11880336, "step": 20605 }, { "epoch": 3.06970509383378, "grad_norm": 0.25803038477897644, "learning_rate": 4.375533798178869e-05, "loss": 0.7949, "num_input_tokens_seen": 11883152, "step": 20610 }, { "epoch": 3.0704498063747394, "grad_norm": 0.31015321612358093, "learning_rate": 4.375104034152936e-05, "loss": 0.7835, "num_input_tokens_seen": 11886032, "step": 20615 }, { "epoch": 3.0711945189156986, "grad_norm": 0.2974760830402374, "learning_rate": 4.374674143415096e-05, "loss": 0.8447, "num_input_tokens_seen": 11889072, "step": 20620 }, { "epoch": 3.071939231456658, "grad_norm": 0.29193463921546936, "learning_rate": 4.374244125994399e-05, "loss": 0.8344, "num_input_tokens_seen": 11892336, "step": 20625 }, { "epoch": 3.072683943997617, "grad_norm": 0.324176549911499, "learning_rate": 4.3738139819199045e-05, "loss": 0.8287, "num_input_tokens_seen": 11895408, "step": 20630 }, { "epoch": 3.073428656538576, "grad_norm": 0.29959940910339355, "learning_rate": 4.3733837112206786e-05, "loss": 0.7945, "num_input_tokens_seen": 11898448, "step": 20635 }, { "epoch": 3.0741733690795354, "grad_norm": 0.32371190190315247, "learning_rate": 4.372953313925798e-05, "loss": 0.8206, "num_input_tokens_seen": 11901424, "step": 20640 }, { "epoch": 3.0749180816204946, "grad_norm": 0.27002108097076416, "learning_rate": 4.3725227900643485e-05, "loss": 0.795, "num_input_tokens_seen": 11904496, "step": 20645 }, { "epoch": 3.075662794161454, "grad_norm": 0.2753315567970276, "learning_rate": 4.372092139665422e-05, "loss": 0.8164, "num_input_tokens_seen": 11907152, "step": 20650 }, { "epoch": 3.076407506702413, "grad_norm": 0.2824914753437042, "learning_rate": 4.3716613627581195e-05, "loss": 0.8315, "num_input_tokens_seen": 11909776, "step": 20655 }, { "epoch": 3.0771522192433722, "grad_norm": 0.37373119592666626, "learning_rate": 4.3712304593715516e-05, "loss": 0.7821, "num_input_tokens_seen": 11912624, "step": 20660 }, { "epoch": 3.0778969317843314, "grad_norm": 0.24205411970615387, "learning_rate": 4.3707994295348374e-05, "loss": 0.788, "num_input_tokens_seen": 11915504, "step": 20665 }, { "epoch": 3.0786416443252906, "grad_norm": 0.2773110866546631, "learning_rate": 4.370368273277103e-05, "loss": 0.8201, "num_input_tokens_seen": 11918736, "step": 20670 }, { "epoch": 3.07938635686625, "grad_norm": 0.27062854170799255, "learning_rate": 4.3699369906274864e-05, "loss": 0.788, "num_input_tokens_seen": 11921776, "step": 20675 }, { "epoch": 3.0801310694072086, "grad_norm": 0.410636842250824, "learning_rate": 4.3695055816151296e-05, "loss": 0.8143, "num_input_tokens_seen": 11924784, "step": 20680 }, { "epoch": 3.080875781948168, "grad_norm": 0.2685414254665375, "learning_rate": 4.369074046269187e-05, "loss": 0.8071, "num_input_tokens_seen": 11927728, "step": 20685 }, { "epoch": 3.081620494489127, "grad_norm": 0.26705148816108704, "learning_rate": 4.3686423846188196e-05, "loss": 0.8012, "num_input_tokens_seen": 11930768, "step": 20690 }, { "epoch": 3.082365207030086, "grad_norm": 0.22258897125720978, "learning_rate": 4.368210596693197e-05, "loss": 0.7918, "num_input_tokens_seen": 11933744, "step": 20695 }, { "epoch": 3.0831099195710454, "grad_norm": 0.25195497274398804, "learning_rate": 4.367778682521498e-05, "loss": 0.8162, "num_input_tokens_seen": 11936560, "step": 20700 }, { "epoch": 3.0838546321120046, "grad_norm": 0.2544466257095337, "learning_rate": 4.367346642132909e-05, "loss": 0.7742, "num_input_tokens_seen": 11939760, "step": 20705 }, { "epoch": 3.084599344652964, "grad_norm": 0.25051671266555786, "learning_rate": 4.366914475556626e-05, "loss": 0.8035, "num_input_tokens_seen": 11942608, "step": 20710 }, { "epoch": 3.085344057193923, "grad_norm": 0.3082605004310608, "learning_rate": 4.3664821828218536e-05, "loss": 0.8229, "num_input_tokens_seen": 11945360, "step": 20715 }, { "epoch": 3.086088769734882, "grad_norm": 0.29075685143470764, "learning_rate": 4.3660497639578036e-05, "loss": 0.8023, "num_input_tokens_seen": 11948016, "step": 20720 }, { "epoch": 3.0868334822758414, "grad_norm": 0.3261756896972656, "learning_rate": 4.3656172189936975e-05, "loss": 0.8226, "num_input_tokens_seen": 11951056, "step": 20725 }, { "epoch": 3.0875781948168006, "grad_norm": 0.2318810373544693, "learning_rate": 4.3651845479587647e-05, "loss": 0.7925, "num_input_tokens_seen": 11954000, "step": 20730 }, { "epoch": 3.08832290735776, "grad_norm": 0.270069420337677, "learning_rate": 4.3647517508822434e-05, "loss": 0.7584, "num_input_tokens_seen": 11956592, "step": 20735 }, { "epoch": 3.089067619898719, "grad_norm": 0.2909466028213501, "learning_rate": 4.36431882779338e-05, "loss": 0.8302, "num_input_tokens_seen": 11959376, "step": 20740 }, { "epoch": 3.0898123324396782, "grad_norm": 0.3081477880477905, "learning_rate": 4.3638857787214304e-05, "loss": 0.7952, "num_input_tokens_seen": 11962032, "step": 20745 }, { "epoch": 3.0905570449806374, "grad_norm": 0.31635063886642456, "learning_rate": 4.363452603695658e-05, "loss": 0.7793, "num_input_tokens_seen": 11965072, "step": 20750 }, { "epoch": 3.0913017575215966, "grad_norm": 0.3142648935317993, "learning_rate": 4.363019302745334e-05, "loss": 0.7601, "num_input_tokens_seen": 11968208, "step": 20755 }, { "epoch": 3.092046470062556, "grad_norm": 0.2747397720813751, "learning_rate": 4.362585875899741e-05, "loss": 0.8291, "num_input_tokens_seen": 11971088, "step": 20760 }, { "epoch": 3.092791182603515, "grad_norm": 0.30933108925819397, "learning_rate": 4.3621523231881665e-05, "loss": 0.8403, "num_input_tokens_seen": 11974128, "step": 20765 }, { "epoch": 3.0935358951444742, "grad_norm": 0.2648871839046478, "learning_rate": 4.36171864463991e-05, "loss": 0.8057, "num_input_tokens_seen": 11976976, "step": 20770 }, { "epoch": 3.0942806076854334, "grad_norm": 0.2854790687561035, "learning_rate": 4.361284840284275e-05, "loss": 0.8185, "num_input_tokens_seen": 11979728, "step": 20775 }, { "epoch": 3.0950253202263927, "grad_norm": 0.32793697714805603, "learning_rate": 4.36085091015058e-05, "loss": 0.8486, "num_input_tokens_seen": 11982672, "step": 20780 }, { "epoch": 3.095770032767352, "grad_norm": 0.2533290386199951, "learning_rate": 4.3604168542681444e-05, "loss": 0.8067, "num_input_tokens_seen": 11985360, "step": 20785 }, { "epoch": 3.096514745308311, "grad_norm": 0.3175499141216278, "learning_rate": 4.3599826726663026e-05, "loss": 0.8072, "num_input_tokens_seen": 11988208, "step": 20790 }, { "epoch": 3.0972594578492703, "grad_norm": 0.20480675995349884, "learning_rate": 4.359548365374394e-05, "loss": 0.8157, "num_input_tokens_seen": 11991120, "step": 20795 }, { "epoch": 3.0980041703902295, "grad_norm": 0.2566295564174652, "learning_rate": 4.3591139324217666e-05, "loss": 0.8031, "num_input_tokens_seen": 11993904, "step": 20800 }, { "epoch": 3.0987488829311887, "grad_norm": 0.35226383805274963, "learning_rate": 4.3586793738377785e-05, "loss": 0.8157, "num_input_tokens_seen": 11996848, "step": 20805 }, { "epoch": 3.099493595472148, "grad_norm": 0.2574499845504761, "learning_rate": 4.358244689651795e-05, "loss": 0.8225, "num_input_tokens_seen": 11999632, "step": 20810 }, { "epoch": 3.100238308013107, "grad_norm": 0.3958663046360016, "learning_rate": 4.357809879893191e-05, "loss": 0.7995, "num_input_tokens_seen": 12002224, "step": 20815 }, { "epoch": 3.1009830205540663, "grad_norm": 0.3059748709201813, "learning_rate": 4.357374944591348e-05, "loss": 0.7916, "num_input_tokens_seen": 12005072, "step": 20820 }, { "epoch": 3.1017277330950255, "grad_norm": 0.27388837933540344, "learning_rate": 4.3569398837756586e-05, "loss": 0.7846, "num_input_tokens_seen": 12007728, "step": 20825 }, { "epoch": 3.1024724456359847, "grad_norm": 0.2319963574409485, "learning_rate": 4.356504697475521e-05, "loss": 0.8317, "num_input_tokens_seen": 12010448, "step": 20830 }, { "epoch": 3.103217158176944, "grad_norm": 0.28567513823509216, "learning_rate": 4.356069385720344e-05, "loss": 0.799, "num_input_tokens_seen": 12013392, "step": 20835 }, { "epoch": 3.103961870717903, "grad_norm": 0.25359684228897095, "learning_rate": 4.3556339485395444e-05, "loss": 0.7773, "num_input_tokens_seen": 12016208, "step": 20840 }, { "epoch": 3.1047065832588623, "grad_norm": 0.3341038227081299, "learning_rate": 4.355198385962547e-05, "loss": 0.8312, "num_input_tokens_seen": 12019152, "step": 20845 }, { "epoch": 3.1054512957998215, "grad_norm": 0.2960885465145111, "learning_rate": 4.354762698018785e-05, "loss": 0.7943, "num_input_tokens_seen": 12021936, "step": 20850 }, { "epoch": 3.1061960083407802, "grad_norm": 0.21448631584644318, "learning_rate": 4.3543268847377005e-05, "loss": 0.8218, "num_input_tokens_seen": 12024944, "step": 20855 }, { "epoch": 3.1069407208817394, "grad_norm": 0.2406311333179474, "learning_rate": 4.353890946148745e-05, "loss": 0.8187, "num_input_tokens_seen": 12027696, "step": 20860 }, { "epoch": 3.1076854334226987, "grad_norm": 0.24230077862739563, "learning_rate": 4.353454882281377e-05, "loss": 0.7882, "num_input_tokens_seen": 12030832, "step": 20865 }, { "epoch": 3.108430145963658, "grad_norm": 0.2568880617618561, "learning_rate": 4.353018693165063e-05, "loss": 0.7894, "num_input_tokens_seen": 12033552, "step": 20870 }, { "epoch": 3.109174858504617, "grad_norm": 0.22272421419620514, "learning_rate": 4.35258237882928e-05, "loss": 0.8066, "num_input_tokens_seen": 12036080, "step": 20875 }, { "epoch": 3.1099195710455763, "grad_norm": 0.2429613173007965, "learning_rate": 4.352145939303511e-05, "loss": 0.78, "num_input_tokens_seen": 12038960, "step": 20880 }, { "epoch": 3.1106642835865355, "grad_norm": 0.2875436842441559, "learning_rate": 4.35170937461725e-05, "loss": 0.7991, "num_input_tokens_seen": 12042032, "step": 20885 }, { "epoch": 3.1114089961274947, "grad_norm": 0.25841036438941956, "learning_rate": 4.3512726847999987e-05, "loss": 0.826, "num_input_tokens_seen": 12044688, "step": 20890 }, { "epoch": 3.112153708668454, "grad_norm": 0.2479134202003479, "learning_rate": 4.3508358698812654e-05, "loss": 0.8004, "num_input_tokens_seen": 12047344, "step": 20895 }, { "epoch": 3.112898421209413, "grad_norm": 0.23995770514011383, "learning_rate": 4.350398929890569e-05, "loss": 0.7993, "num_input_tokens_seen": 12050256, "step": 20900 }, { "epoch": 3.1136431337503723, "grad_norm": 0.1803126186132431, "learning_rate": 4.349961864857436e-05, "loss": 0.8127, "num_input_tokens_seen": 12053040, "step": 20905 }, { "epoch": 3.1143878462913315, "grad_norm": 0.2647070288658142, "learning_rate": 4.349524674811403e-05, "loss": 0.8037, "num_input_tokens_seen": 12055920, "step": 20910 }, { "epoch": 3.1151325588322907, "grad_norm": 0.25577878952026367, "learning_rate": 4.3490873597820106e-05, "loss": 0.8054, "num_input_tokens_seen": 12059216, "step": 20915 }, { "epoch": 3.11587727137325, "grad_norm": 0.30899691581726074, "learning_rate": 4.3486499197988126e-05, "loss": 0.7933, "num_input_tokens_seen": 12062160, "step": 20920 }, { "epoch": 3.116621983914209, "grad_norm": 0.2737640142440796, "learning_rate": 4.348212354891369e-05, "loss": 0.7861, "num_input_tokens_seen": 12065040, "step": 20925 }, { "epoch": 3.1173666964551683, "grad_norm": 0.2814925014972687, "learning_rate": 4.347774665089248e-05, "loss": 0.8238, "num_input_tokens_seen": 12068176, "step": 20930 }, { "epoch": 3.1181114089961275, "grad_norm": 0.3276169002056122, "learning_rate": 4.347336850422029e-05, "loss": 0.8098, "num_input_tokens_seen": 12070928, "step": 20935 }, { "epoch": 3.1188561215370867, "grad_norm": 0.222854882478714, "learning_rate": 4.346898910919296e-05, "loss": 0.7666, "num_input_tokens_seen": 12073776, "step": 20940 }, { "epoch": 3.119600834078046, "grad_norm": 0.30219635367393494, "learning_rate": 4.346460846610643e-05, "loss": 0.8118, "num_input_tokens_seen": 12076624, "step": 20945 }, { "epoch": 3.120345546619005, "grad_norm": 0.20848709344863892, "learning_rate": 4.346022657525673e-05, "loss": 0.7887, "num_input_tokens_seen": 12079728, "step": 20950 }, { "epoch": 3.1210902591599643, "grad_norm": 0.2962628901004791, "learning_rate": 4.345584343693998e-05, "loss": 0.7902, "num_input_tokens_seen": 12082672, "step": 20955 }, { "epoch": 3.1218349717009235, "grad_norm": 0.28966930508613586, "learning_rate": 4.345145905145237e-05, "loss": 0.8204, "num_input_tokens_seen": 12085680, "step": 20960 }, { "epoch": 3.1225796842418827, "grad_norm": 0.33590614795684814, "learning_rate": 4.344707341909017e-05, "loss": 0.8029, "num_input_tokens_seen": 12088432, "step": 20965 }, { "epoch": 3.123324396782842, "grad_norm": 0.29821646213531494, "learning_rate": 4.3442686540149744e-05, "loss": 0.7975, "num_input_tokens_seen": 12091600, "step": 20970 }, { "epoch": 3.124069109323801, "grad_norm": 0.2670862376689911, "learning_rate": 4.343829841492755e-05, "loss": 0.7807, "num_input_tokens_seen": 12094416, "step": 20975 }, { "epoch": 3.1248138218647603, "grad_norm": 0.22001099586486816, "learning_rate": 4.343390904372011e-05, "loss": 0.7785, "num_input_tokens_seen": 12097488, "step": 20980 }, { "epoch": 3.1255585344057195, "grad_norm": 0.35262390971183777, "learning_rate": 4.3429518426824047e-05, "loss": 0.8359, "num_input_tokens_seen": 12100560, "step": 20985 }, { "epoch": 3.1263032469466787, "grad_norm": 0.2936501204967499, "learning_rate": 4.342512656453606e-05, "loss": 0.7629, "num_input_tokens_seen": 12103248, "step": 20990 }, { "epoch": 3.127047959487638, "grad_norm": 0.18432947993278503, "learning_rate": 4.342073345715292e-05, "loss": 0.7581, "num_input_tokens_seen": 12105776, "step": 20995 }, { "epoch": 3.127792672028597, "grad_norm": 0.4875721335411072, "learning_rate": 4.341633910497151e-05, "loss": 0.8135, "num_input_tokens_seen": 12108656, "step": 21000 }, { "epoch": 3.1285373845695563, "grad_norm": 0.450596421957016, "learning_rate": 4.3411943508288786e-05, "loss": 0.8345, "num_input_tokens_seen": 12111536, "step": 21005 }, { "epoch": 3.1292820971105155, "grad_norm": 0.32673922181129456, "learning_rate": 4.3407546667401776e-05, "loss": 0.8534, "num_input_tokens_seen": 12114704, "step": 21010 }, { "epoch": 3.1300268096514747, "grad_norm": 0.21906186640262604, "learning_rate": 4.34031485826076e-05, "loss": 0.7792, "num_input_tokens_seen": 12117488, "step": 21015 }, { "epoch": 3.1307715221924335, "grad_norm": 0.3375011384487152, "learning_rate": 4.339874925420347e-05, "loss": 0.8274, "num_input_tokens_seen": 12120304, "step": 21020 }, { "epoch": 3.131516234733393, "grad_norm": 0.22337570786476135, "learning_rate": 4.339434868248665e-05, "loss": 0.7757, "num_input_tokens_seen": 12123280, "step": 21025 }, { "epoch": 3.132260947274352, "grad_norm": 0.3893274962902069, "learning_rate": 4.3389946867754546e-05, "loss": 0.8352, "num_input_tokens_seen": 12125968, "step": 21030 }, { "epoch": 3.133005659815311, "grad_norm": 0.3476792573928833, "learning_rate": 4.338554381030459e-05, "loss": 0.8399, "num_input_tokens_seen": 12128784, "step": 21035 }, { "epoch": 3.1337503723562703, "grad_norm": 0.2980321943759918, "learning_rate": 4.338113951043436e-05, "loss": 0.8066, "num_input_tokens_seen": 12131632, "step": 21040 }, { "epoch": 3.1344950848972295, "grad_norm": 0.26750150322914124, "learning_rate": 4.337673396844143e-05, "loss": 0.814, "num_input_tokens_seen": 12134512, "step": 21045 }, { "epoch": 3.1352397974381887, "grad_norm": 0.24202653765678406, "learning_rate": 4.337232718462354e-05, "loss": 0.8012, "num_input_tokens_seen": 12137744, "step": 21050 }, { "epoch": 3.135984509979148, "grad_norm": 0.27477365732192993, "learning_rate": 4.336791915927847e-05, "loss": 0.8054, "num_input_tokens_seen": 12140528, "step": 21055 }, { "epoch": 3.136729222520107, "grad_norm": 0.33115315437316895, "learning_rate": 4.3363509892704114e-05, "loss": 0.7889, "num_input_tokens_seen": 12143376, "step": 21060 }, { "epoch": 3.1374739350610663, "grad_norm": 0.19016803801059723, "learning_rate": 4.335909938519841e-05, "loss": 0.8151, "num_input_tokens_seen": 12146064, "step": 21065 }, { "epoch": 3.1382186476020255, "grad_norm": 0.3193868100643158, "learning_rate": 4.3354687637059414e-05, "loss": 0.7691, "num_input_tokens_seen": 12149328, "step": 21070 }, { "epoch": 3.1389633601429847, "grad_norm": 0.30547890067100525, "learning_rate": 4.335027464858526e-05, "loss": 0.8061, "num_input_tokens_seen": 12152048, "step": 21075 }, { "epoch": 3.139708072683944, "grad_norm": 0.26117533445358276, "learning_rate": 4.334586042007414e-05, "loss": 0.7929, "num_input_tokens_seen": 12154864, "step": 21080 }, { "epoch": 3.140452785224903, "grad_norm": 0.23499447107315063, "learning_rate": 4.3341444951824365e-05, "loss": 0.8037, "num_input_tokens_seen": 12158032, "step": 21085 }, { "epoch": 3.1411974977658623, "grad_norm": 0.35866180062294006, "learning_rate": 4.3337028244134315e-05, "loss": 0.8105, "num_input_tokens_seen": 12160656, "step": 21090 }, { "epoch": 3.1419422103068215, "grad_norm": 0.2567020356655121, "learning_rate": 4.3332610297302445e-05, "loss": 0.7997, "num_input_tokens_seen": 12163376, "step": 21095 }, { "epoch": 3.1426869228477807, "grad_norm": 0.2692365348339081, "learning_rate": 4.3328191111627306e-05, "loss": 0.8012, "num_input_tokens_seen": 12166192, "step": 21100 }, { "epoch": 3.14343163538874, "grad_norm": 0.26206159591674805, "learning_rate": 4.332377068740753e-05, "loss": 0.808, "num_input_tokens_seen": 12169360, "step": 21105 }, { "epoch": 3.144176347929699, "grad_norm": 0.3368852734565735, "learning_rate": 4.331934902494184e-05, "loss": 0.7928, "num_input_tokens_seen": 12171984, "step": 21110 }, { "epoch": 3.1449210604706583, "grad_norm": 0.2206275314092636, "learning_rate": 4.331492612452901e-05, "loss": 0.8269, "num_input_tokens_seen": 12174576, "step": 21115 }, { "epoch": 3.1456657730116175, "grad_norm": 0.29384323954582214, "learning_rate": 4.331050198646794e-05, "loss": 0.8134, "num_input_tokens_seen": 12177296, "step": 21120 }, { "epoch": 3.1464104855525767, "grad_norm": 0.2747700810432434, "learning_rate": 4.330607661105759e-05, "loss": 0.8116, "num_input_tokens_seen": 12180304, "step": 21125 }, { "epoch": 3.147155198093536, "grad_norm": 0.4795995056629181, "learning_rate": 4.330164999859702e-05, "loss": 0.8254, "num_input_tokens_seen": 12183408, "step": 21130 }, { "epoch": 3.147899910634495, "grad_norm": 0.2699768841266632, "learning_rate": 4.3297222149385336e-05, "loss": 0.8118, "num_input_tokens_seen": 12186192, "step": 21135 }, { "epoch": 3.1486446231754543, "grad_norm": 0.35231101512908936, "learning_rate": 4.329279306372178e-05, "loss": 0.8379, "num_input_tokens_seen": 12189136, "step": 21140 }, { "epoch": 3.1493893357164136, "grad_norm": 0.2513883113861084, "learning_rate": 4.3288362741905635e-05, "loss": 0.7969, "num_input_tokens_seen": 12192176, "step": 21145 }, { "epoch": 3.1501340482573728, "grad_norm": 0.24482952058315277, "learning_rate": 4.32839311842363e-05, "loss": 0.8064, "num_input_tokens_seen": 12195056, "step": 21150 }, { "epoch": 3.150878760798332, "grad_norm": 0.2827152609825134, "learning_rate": 4.327949839101323e-05, "loss": 0.7979, "num_input_tokens_seen": 12197712, "step": 21155 }, { "epoch": 3.151623473339291, "grad_norm": 0.26434171199798584, "learning_rate": 4.3275064362535966e-05, "loss": 0.8089, "num_input_tokens_seen": 12200944, "step": 21160 }, { "epoch": 3.1523681858802504, "grad_norm": 0.4290095269680023, "learning_rate": 4.327062909910417e-05, "loss": 0.7764, "num_input_tokens_seen": 12203472, "step": 21165 }, { "epoch": 3.1531128984212096, "grad_norm": 0.30495724081993103, "learning_rate": 4.326619260101753e-05, "loss": 0.8152, "num_input_tokens_seen": 12206256, "step": 21170 }, { "epoch": 3.1538576109621688, "grad_norm": 0.23668552935123444, "learning_rate": 4.326175486857587e-05, "loss": 0.8094, "num_input_tokens_seen": 12209200, "step": 21175 }, { "epoch": 3.154602323503128, "grad_norm": 0.2711987793445587, "learning_rate": 4.3257315902079055e-05, "loss": 0.8122, "num_input_tokens_seen": 12211888, "step": 21180 }, { "epoch": 3.155347036044087, "grad_norm": 0.35721224546432495, "learning_rate": 4.3252875701827064e-05, "loss": 0.7993, "num_input_tokens_seen": 12214800, "step": 21185 }, { "epoch": 3.1560917485850464, "grad_norm": 0.2927946150302887, "learning_rate": 4.324843426811994e-05, "loss": 0.8088, "num_input_tokens_seen": 12218032, "step": 21190 }, { "epoch": 3.156836461126005, "grad_norm": 0.2543589770793915, "learning_rate": 4.324399160125782e-05, "loss": 0.7936, "num_input_tokens_seen": 12220816, "step": 21195 }, { "epoch": 3.157581173666965, "grad_norm": 0.300353467464447, "learning_rate": 4.323954770154093e-05, "loss": 0.7739, "num_input_tokens_seen": 12223824, "step": 21200 }, { "epoch": 3.1583258862079235, "grad_norm": 0.2336316853761673, "learning_rate": 4.323510256926956e-05, "loss": 0.8226, "num_input_tokens_seen": 12226928, "step": 21205 }, { "epoch": 3.1590705987488827, "grad_norm": 0.3191935122013092, "learning_rate": 4.323065620474409e-05, "loss": 0.771, "num_input_tokens_seen": 12229808, "step": 21210 }, { "epoch": 3.159815311289842, "grad_norm": 0.21448540687561035, "learning_rate": 4.3226208608265e-05, "loss": 0.8017, "num_input_tokens_seen": 12232400, "step": 21215 }, { "epoch": 3.160560023830801, "grad_norm": 0.2752368152141571, "learning_rate": 4.322175978013283e-05, "loss": 0.7477, "num_input_tokens_seen": 12235344, "step": 21220 }, { "epoch": 3.1613047363717603, "grad_norm": 0.23090729117393494, "learning_rate": 4.321730972064823e-05, "loss": 0.855, "num_input_tokens_seen": 12238096, "step": 21225 }, { "epoch": 3.1620494489127196, "grad_norm": 0.4883398115634918, "learning_rate": 4.32128584301119e-05, "loss": 0.8213, "num_input_tokens_seen": 12241168, "step": 21230 }, { "epoch": 3.1627941614536788, "grad_norm": 0.2958584427833557, "learning_rate": 4.320840590882464e-05, "loss": 0.8436, "num_input_tokens_seen": 12244112, "step": 21235 }, { "epoch": 3.163538873994638, "grad_norm": 0.24695497751235962, "learning_rate": 4.320395215708734e-05, "loss": 0.7908, "num_input_tokens_seen": 12247440, "step": 21240 }, { "epoch": 3.164283586535597, "grad_norm": 0.3532020151615143, "learning_rate": 4.319949717520096e-05, "loss": 0.7887, "num_input_tokens_seen": 12250256, "step": 21245 }, { "epoch": 3.1650282990765564, "grad_norm": 0.2233837991952896, "learning_rate": 4.319504096346657e-05, "loss": 0.8066, "num_input_tokens_seen": 12252944, "step": 21250 }, { "epoch": 3.1657730116175156, "grad_norm": 0.19382695853710175, "learning_rate": 4.319058352218528e-05, "loss": 0.8225, "num_input_tokens_seen": 12255696, "step": 21255 }, { "epoch": 3.1665177241584748, "grad_norm": 0.23813068866729736, "learning_rate": 4.3186124851658305e-05, "loss": 0.831, "num_input_tokens_seen": 12258544, "step": 21260 }, { "epoch": 3.167262436699434, "grad_norm": 0.36933186650276184, "learning_rate": 4.318166495218696e-05, "loss": 0.8201, "num_input_tokens_seen": 12261808, "step": 21265 }, { "epoch": 3.168007149240393, "grad_norm": 0.25682172179222107, "learning_rate": 4.317720382407262e-05, "loss": 0.8216, "num_input_tokens_seen": 12264592, "step": 21270 }, { "epoch": 3.1687518617813524, "grad_norm": 0.20533137023448944, "learning_rate": 4.317274146761674e-05, "loss": 0.7738, "num_input_tokens_seen": 12267440, "step": 21275 }, { "epoch": 3.1694965743223116, "grad_norm": 0.2678854465484619, "learning_rate": 4.316827788312089e-05, "loss": 0.7912, "num_input_tokens_seen": 12270416, "step": 21280 }, { "epoch": 3.170241286863271, "grad_norm": 0.29146119952201843, "learning_rate": 4.316381307088668e-05, "loss": 0.7831, "num_input_tokens_seen": 12273424, "step": 21285 }, { "epoch": 3.17098599940423, "grad_norm": 0.1945275366306305, "learning_rate": 4.315934703121583e-05, "loss": 0.8022, "num_input_tokens_seen": 12276080, "step": 21290 }, { "epoch": 3.171730711945189, "grad_norm": 0.2792681157588959, "learning_rate": 4.315487976441014e-05, "loss": 0.7785, "num_input_tokens_seen": 12279152, "step": 21295 }, { "epoch": 3.1724754244861484, "grad_norm": 0.24596178531646729, "learning_rate": 4.3150411270771486e-05, "loss": 0.7864, "num_input_tokens_seen": 12282352, "step": 21300 }, { "epoch": 3.1732201370271076, "grad_norm": 0.2632717490196228, "learning_rate": 4.3145941550601836e-05, "loss": 0.8264, "num_input_tokens_seen": 12285040, "step": 21305 }, { "epoch": 3.173964849568067, "grad_norm": 0.1821284294128418, "learning_rate": 4.314147060420323e-05, "loss": 0.7994, "num_input_tokens_seen": 12288016, "step": 21310 }, { "epoch": 3.174709562109026, "grad_norm": 0.29440024495124817, "learning_rate": 4.31369984318778e-05, "loss": 0.7801, "num_input_tokens_seen": 12290896, "step": 21315 }, { "epoch": 3.175454274649985, "grad_norm": 0.30585354566574097, "learning_rate": 4.313252503392775e-05, "loss": 0.7966, "num_input_tokens_seen": 12293904, "step": 21320 }, { "epoch": 3.1761989871909444, "grad_norm": 0.23688462376594543, "learning_rate": 4.3128050410655384e-05, "loss": 0.7802, "num_input_tokens_seen": 12296784, "step": 21325 }, { "epoch": 3.1769436997319036, "grad_norm": 0.4010571539402008, "learning_rate": 4.312357456236308e-05, "loss": 0.7749, "num_input_tokens_seen": 12300112, "step": 21330 }, { "epoch": 3.177688412272863, "grad_norm": 0.3042450249195099, "learning_rate": 4.3119097489353285e-05, "loss": 0.8339, "num_input_tokens_seen": 12303120, "step": 21335 }, { "epoch": 3.178433124813822, "grad_norm": 0.31636881828308105, "learning_rate": 4.311461919192855e-05, "loss": 0.8078, "num_input_tokens_seen": 12306064, "step": 21340 }, { "epoch": 3.179177837354781, "grad_norm": 0.28546205163002014, "learning_rate": 4.31101396703915e-05, "loss": 0.8388, "num_input_tokens_seen": 12308912, "step": 21345 }, { "epoch": 3.1799225498957404, "grad_norm": 0.2806244492530823, "learning_rate": 4.310565892504484e-05, "loss": 0.791, "num_input_tokens_seen": 12311792, "step": 21350 }, { "epoch": 3.1806672624366996, "grad_norm": 0.17180326581001282, "learning_rate": 4.3101176956191365e-05, "loss": 0.7994, "num_input_tokens_seen": 12314544, "step": 21355 }, { "epoch": 3.181411974977659, "grad_norm": 0.3461175262928009, "learning_rate": 4.309669376413394e-05, "loss": 0.8236, "num_input_tokens_seen": 12317456, "step": 21360 }, { "epoch": 3.182156687518618, "grad_norm": 0.19535376131534576, "learning_rate": 4.309220934917553e-05, "loss": 0.8224, "num_input_tokens_seen": 12320400, "step": 21365 }, { "epoch": 3.182901400059577, "grad_norm": 0.22775623202323914, "learning_rate": 4.3087723711619166e-05, "loss": 0.8125, "num_input_tokens_seen": 12323120, "step": 21370 }, { "epoch": 3.1836461126005364, "grad_norm": 0.22519345581531525, "learning_rate": 4.3083236851767976e-05, "loss": 0.7977, "num_input_tokens_seen": 12325936, "step": 21375 }, { "epoch": 3.184390825141495, "grad_norm": 0.28216105699539185, "learning_rate": 4.307874876992516e-05, "loss": 0.7723, "num_input_tokens_seen": 12328816, "step": 21380 }, { "epoch": 3.1851355376824544, "grad_norm": 0.27125221490859985, "learning_rate": 4.307425946639401e-05, "loss": 0.8054, "num_input_tokens_seen": 12331856, "step": 21385 }, { "epoch": 3.1858802502234136, "grad_norm": 0.26103249192237854, "learning_rate": 4.3069768941477885e-05, "loss": 0.7909, "num_input_tokens_seen": 12334864, "step": 21390 }, { "epoch": 3.186624962764373, "grad_norm": 0.2608576714992523, "learning_rate": 4.3065277195480235e-05, "loss": 0.8024, "num_input_tokens_seen": 12337328, "step": 21395 }, { "epoch": 3.187369675305332, "grad_norm": 0.32601115107536316, "learning_rate": 4.306078422870461e-05, "loss": 0.8286, "num_input_tokens_seen": 12340400, "step": 21400 }, { "epoch": 3.188114387846291, "grad_norm": 0.2839157283306122, "learning_rate": 4.3056290041454615e-05, "loss": 0.7889, "num_input_tokens_seen": 12343280, "step": 21405 }, { "epoch": 3.1888591003872504, "grad_norm": 0.29613929986953735, "learning_rate": 4.3051794634033946e-05, "loss": 0.805, "num_input_tokens_seen": 12345936, "step": 21410 }, { "epoch": 3.1896038129282096, "grad_norm": 0.2715907394886017, "learning_rate": 4.304729800674639e-05, "loss": 0.8131, "num_input_tokens_seen": 12348880, "step": 21415 }, { "epoch": 3.190348525469169, "grad_norm": 0.26872292160987854, "learning_rate": 4.304280015989581e-05, "loss": 0.8155, "num_input_tokens_seen": 12351984, "step": 21420 }, { "epoch": 3.191093238010128, "grad_norm": 0.26290807127952576, "learning_rate": 4.303830109378616e-05, "loss": 0.8217, "num_input_tokens_seen": 12354896, "step": 21425 }, { "epoch": 3.191837950551087, "grad_norm": 0.3332463800907135, "learning_rate": 4.303380080872145e-05, "loss": 0.8198, "num_input_tokens_seen": 12357680, "step": 21430 }, { "epoch": 3.1925826630920464, "grad_norm": 0.3148413300514221, "learning_rate": 4.302929930500581e-05, "loss": 0.8407, "num_input_tokens_seen": 12360592, "step": 21435 }, { "epoch": 3.1933273756330056, "grad_norm": 0.2581261098384857, "learning_rate": 4.302479658294341e-05, "loss": 0.7867, "num_input_tokens_seen": 12363344, "step": 21440 }, { "epoch": 3.194072088173965, "grad_norm": 0.22554698586463928, "learning_rate": 4.3020292642838556e-05, "loss": 0.7825, "num_input_tokens_seen": 12366352, "step": 21445 }, { "epoch": 3.194816800714924, "grad_norm": 0.3071846663951874, "learning_rate": 4.301578748499558e-05, "loss": 0.8175, "num_input_tokens_seen": 12369200, "step": 21450 }, { "epoch": 3.1955615132558832, "grad_norm": 0.26263025403022766, "learning_rate": 4.301128110971895e-05, "loss": 0.8083, "num_input_tokens_seen": 12372208, "step": 21455 }, { "epoch": 3.1963062257968424, "grad_norm": 0.2060554474592209, "learning_rate": 4.300677351731315e-05, "loss": 0.8142, "num_input_tokens_seen": 12375152, "step": 21460 }, { "epoch": 3.1970509383378016, "grad_norm": 0.29096344113349915, "learning_rate": 4.300226470808282e-05, "loss": 0.8281, "num_input_tokens_seen": 12378096, "step": 21465 }, { "epoch": 3.197795650878761, "grad_norm": 0.24479590356349945, "learning_rate": 4.2997754682332626e-05, "loss": 0.788, "num_input_tokens_seen": 12381200, "step": 21470 }, { "epoch": 3.19854036341972, "grad_norm": 0.21027128398418427, "learning_rate": 4.2993243440367345e-05, "loss": 0.8253, "num_input_tokens_seen": 12383952, "step": 21475 }, { "epoch": 3.1992850759606792, "grad_norm": 0.19801682233810425, "learning_rate": 4.2988730982491824e-05, "loss": 0.7879, "num_input_tokens_seen": 12386672, "step": 21480 }, { "epoch": 3.2000297885016384, "grad_norm": 0.20102910697460175, "learning_rate": 4.2984217309011e-05, "loss": 0.8188, "num_input_tokens_seen": 12389520, "step": 21485 }, { "epoch": 3.2007745010425976, "grad_norm": 0.25484320521354675, "learning_rate": 4.2979702420229894e-05, "loss": 0.8059, "num_input_tokens_seen": 12392560, "step": 21490 }, { "epoch": 3.201519213583557, "grad_norm": 0.321513295173645, "learning_rate": 4.29751863164536e-05, "loss": 0.8121, "num_input_tokens_seen": 12395184, "step": 21495 }, { "epoch": 3.202263926124516, "grad_norm": 0.24958404898643494, "learning_rate": 4.2970668997987294e-05, "loss": 0.8168, "num_input_tokens_seen": 12398032, "step": 21500 }, { "epoch": 3.2030086386654752, "grad_norm": 0.24712051451206207, "learning_rate": 4.296615046513624e-05, "loss": 0.8177, "num_input_tokens_seen": 12401168, "step": 21505 }, { "epoch": 3.2037533512064345, "grad_norm": 0.3590177297592163, "learning_rate": 4.296163071820578e-05, "loss": 0.8085, "num_input_tokens_seen": 12404144, "step": 21510 }, { "epoch": 3.2044980637473937, "grad_norm": 0.2664335370063782, "learning_rate": 4.295710975750135e-05, "loss": 0.8324, "num_input_tokens_seen": 12407216, "step": 21515 }, { "epoch": 3.205242776288353, "grad_norm": 0.2877538502216339, "learning_rate": 4.295258758332845e-05, "loss": 0.8009, "num_input_tokens_seen": 12409904, "step": 21520 }, { "epoch": 3.205987488829312, "grad_norm": 0.2944648563861847, "learning_rate": 4.294806419599267e-05, "loss": 0.7947, "num_input_tokens_seen": 12413008, "step": 21525 }, { "epoch": 3.2067322013702713, "grad_norm": 0.20783182978630066, "learning_rate": 4.2943539595799675e-05, "loss": 0.8116, "num_input_tokens_seen": 12415888, "step": 21530 }, { "epoch": 3.2074769139112305, "grad_norm": 0.2902829349040985, "learning_rate": 4.293901378305523e-05, "loss": 0.7854, "num_input_tokens_seen": 12418800, "step": 21535 }, { "epoch": 3.2082216264521897, "grad_norm": 0.25818827748298645, "learning_rate": 4.2934486758065176e-05, "loss": 0.7972, "num_input_tokens_seen": 12421680, "step": 21540 }, { "epoch": 3.2089663389931484, "grad_norm": 0.2378757745027542, "learning_rate": 4.292995852113542e-05, "loss": 0.8072, "num_input_tokens_seen": 12424816, "step": 21545 }, { "epoch": 3.2097110515341076, "grad_norm": 0.2913832664489746, "learning_rate": 4.292542907257196e-05, "loss": 0.7989, "num_input_tokens_seen": 12427472, "step": 21550 }, { "epoch": 3.210455764075067, "grad_norm": 0.27919313311576843, "learning_rate": 4.292089841268089e-05, "loss": 0.8136, "num_input_tokens_seen": 12430480, "step": 21555 }, { "epoch": 3.211200476616026, "grad_norm": 0.22730137407779694, "learning_rate": 4.291636654176836e-05, "loss": 0.8023, "num_input_tokens_seen": 12433360, "step": 21560 }, { "epoch": 3.2119451891569852, "grad_norm": 0.37117645144462585, "learning_rate": 4.291183346014063e-05, "loss": 0.7737, "num_input_tokens_seen": 12436336, "step": 21565 }, { "epoch": 3.2126899016979444, "grad_norm": 0.26038745045661926, "learning_rate": 4.290729916810401e-05, "loss": 0.7958, "num_input_tokens_seen": 12439184, "step": 21570 }, { "epoch": 3.2134346142389036, "grad_norm": 0.24227434396743774, "learning_rate": 4.290276366596492e-05, "loss": 0.8153, "num_input_tokens_seen": 12442000, "step": 21575 }, { "epoch": 3.214179326779863, "grad_norm": 0.3265063762664795, "learning_rate": 4.2898226954029844e-05, "loss": 0.8209, "num_input_tokens_seen": 12444720, "step": 21580 }, { "epoch": 3.214924039320822, "grad_norm": 0.22084170579910278, "learning_rate": 4.289368903260536e-05, "loss": 0.7888, "num_input_tokens_seen": 12447440, "step": 21585 }, { "epoch": 3.2156687518617812, "grad_norm": 0.2294529378414154, "learning_rate": 4.288914990199814e-05, "loss": 0.7747, "num_input_tokens_seen": 12450256, "step": 21590 }, { "epoch": 3.2164134644027405, "grad_norm": 0.22334009408950806, "learning_rate": 4.288460956251489e-05, "loss": 0.8079, "num_input_tokens_seen": 12453136, "step": 21595 }, { "epoch": 3.2171581769436997, "grad_norm": 0.2985018193721771, "learning_rate": 4.288006801446243e-05, "loss": 0.8357, "num_input_tokens_seen": 12456272, "step": 21600 }, { "epoch": 3.217902889484659, "grad_norm": 0.2693454623222351, "learning_rate": 4.287552525814768e-05, "loss": 0.8245, "num_input_tokens_seen": 12458992, "step": 21605 }, { "epoch": 3.218647602025618, "grad_norm": 0.23900596797466278, "learning_rate": 4.2870981293877605e-05, "loss": 0.787, "num_input_tokens_seen": 12461936, "step": 21610 }, { "epoch": 3.2193923145665773, "grad_norm": 0.2664697766304016, "learning_rate": 4.286643612195927e-05, "loss": 0.8147, "num_input_tokens_seen": 12464656, "step": 21615 }, { "epoch": 3.2201370271075365, "grad_norm": 0.2478775978088379, "learning_rate": 4.286188974269983e-05, "loss": 0.8426, "num_input_tokens_seen": 12467600, "step": 21620 }, { "epoch": 3.2208817396484957, "grad_norm": 0.2834426760673523, "learning_rate": 4.28573421564065e-05, "loss": 0.842, "num_input_tokens_seen": 12470512, "step": 21625 }, { "epoch": 3.221626452189455, "grad_norm": 0.2542065680027008, "learning_rate": 4.2852793363386585e-05, "loss": 0.8133, "num_input_tokens_seen": 12473328, "step": 21630 }, { "epoch": 3.222371164730414, "grad_norm": 0.19155217707157135, "learning_rate": 4.2848243363947484e-05, "loss": 0.7716, "num_input_tokens_seen": 12476016, "step": 21635 }, { "epoch": 3.2231158772713733, "grad_norm": 0.4476430416107178, "learning_rate": 4.2843692158396655e-05, "loss": 0.801, "num_input_tokens_seen": 12479152, "step": 21640 }, { "epoch": 3.2238605898123325, "grad_norm": 0.35444074869155884, "learning_rate": 4.283913974704166e-05, "loss": 0.804, "num_input_tokens_seen": 12481840, "step": 21645 }, { "epoch": 3.2246053023532917, "grad_norm": 0.20199552178382874, "learning_rate": 4.283458613019013e-05, "loss": 0.8057, "num_input_tokens_seen": 12484880, "step": 21650 }, { "epoch": 3.225350014894251, "grad_norm": 0.2519441843032837, "learning_rate": 4.283003130814978e-05, "loss": 0.7897, "num_input_tokens_seen": 12487536, "step": 21655 }, { "epoch": 3.22609472743521, "grad_norm": 0.34203994274139404, "learning_rate": 4.2825475281228406e-05, "loss": 0.8291, "num_input_tokens_seen": 12490320, "step": 21660 }, { "epoch": 3.2268394399761693, "grad_norm": 0.23657526075839996, "learning_rate": 4.282091804973388e-05, "loss": 0.8302, "num_input_tokens_seen": 12493200, "step": 21665 }, { "epoch": 3.2275841525171285, "grad_norm": 0.25011974573135376, "learning_rate": 4.2816359613974176e-05, "loss": 0.7896, "num_input_tokens_seen": 12495664, "step": 21670 }, { "epoch": 3.2283288650580877, "grad_norm": 0.30563420057296753, "learning_rate": 4.281179997425732e-05, "loss": 0.8225, "num_input_tokens_seen": 12498864, "step": 21675 }, { "epoch": 3.229073577599047, "grad_norm": 0.2927817404270172, "learning_rate": 4.280723913089144e-05, "loss": 0.7857, "num_input_tokens_seen": 12501648, "step": 21680 }, { "epoch": 3.229818290140006, "grad_norm": 0.28998231887817383, "learning_rate": 4.280267708418474e-05, "loss": 0.8076, "num_input_tokens_seen": 12504464, "step": 21685 }, { "epoch": 3.2305630026809653, "grad_norm": 0.20278100669384003, "learning_rate": 4.279811383444551e-05, "loss": 0.8298, "num_input_tokens_seen": 12507664, "step": 21690 }, { "epoch": 3.2313077152219245, "grad_norm": 0.22715944051742554, "learning_rate": 4.2793549381982095e-05, "loss": 0.7842, "num_input_tokens_seen": 12510320, "step": 21695 }, { "epoch": 3.2320524277628837, "grad_norm": 0.3464692533016205, "learning_rate": 4.278898372710296e-05, "loss": 0.8233, "num_input_tokens_seen": 12513136, "step": 21700 }, { "epoch": 3.232797140303843, "grad_norm": 0.197373166680336, "learning_rate": 4.2784416870116635e-05, "loss": 0.8143, "num_input_tokens_seen": 12515664, "step": 21705 }, { "epoch": 3.233541852844802, "grad_norm": 0.36160391569137573, "learning_rate": 4.2779848811331726e-05, "loss": 0.8172, "num_input_tokens_seen": 12518512, "step": 21710 }, { "epoch": 3.2342865653857613, "grad_norm": 0.29329532384872437, "learning_rate": 4.2775279551056914e-05, "loss": 0.8309, "num_input_tokens_seen": 12521232, "step": 21715 }, { "epoch": 3.23503127792672, "grad_norm": 0.32694604992866516, "learning_rate": 4.277070908960098e-05, "loss": 0.8088, "num_input_tokens_seen": 12523888, "step": 21720 }, { "epoch": 3.2357759904676793, "grad_norm": 0.24267491698265076, "learning_rate": 4.276613742727278e-05, "loss": 0.7846, "num_input_tokens_seen": 12526832, "step": 21725 }, { "epoch": 3.2365207030086385, "grad_norm": 0.2519626319408417, "learning_rate": 4.276156456438124e-05, "loss": 0.8014, "num_input_tokens_seen": 12529744, "step": 21730 }, { "epoch": 3.2372654155495977, "grad_norm": 0.24767722189426422, "learning_rate": 4.275699050123538e-05, "loss": 0.8007, "num_input_tokens_seen": 12532720, "step": 21735 }, { "epoch": 3.238010128090557, "grad_norm": 0.2282961755990982, "learning_rate": 4.27524152381443e-05, "loss": 0.798, "num_input_tokens_seen": 12535504, "step": 21740 }, { "epoch": 3.238754840631516, "grad_norm": 0.29996418952941895, "learning_rate": 4.2747838775417174e-05, "loss": 0.7874, "num_input_tokens_seen": 12538768, "step": 21745 }, { "epoch": 3.2394995531724753, "grad_norm": 0.3162810802459717, "learning_rate": 4.2743261113363266e-05, "loss": 0.8249, "num_input_tokens_seen": 12541616, "step": 21750 }, { "epoch": 3.2402442657134345, "grad_norm": 0.30213457345962524, "learning_rate": 4.27386822522919e-05, "loss": 0.7787, "num_input_tokens_seen": 12544400, "step": 21755 }, { "epoch": 3.2409889782543937, "grad_norm": 0.2493450790643692, "learning_rate": 4.273410219251252e-05, "loss": 0.7867, "num_input_tokens_seen": 12547280, "step": 21760 }, { "epoch": 3.241733690795353, "grad_norm": 0.3144781291484833, "learning_rate": 4.27295209343346e-05, "loss": 0.785, "num_input_tokens_seen": 12550224, "step": 21765 }, { "epoch": 3.242478403336312, "grad_norm": 0.2362293154001236, "learning_rate": 4.2724938478067746e-05, "loss": 0.8042, "num_input_tokens_seen": 12553072, "step": 21770 }, { "epoch": 3.2432231158772713, "grad_norm": 0.2146168053150177, "learning_rate": 4.2720354824021616e-05, "loss": 0.8032, "num_input_tokens_seen": 12556016, "step": 21775 }, { "epoch": 3.2439678284182305, "grad_norm": 0.28152838349342346, "learning_rate": 4.271576997250595e-05, "loss": 0.809, "num_input_tokens_seen": 12558704, "step": 21780 }, { "epoch": 3.2447125409591897, "grad_norm": 0.28256943821907043, "learning_rate": 4.271118392383058e-05, "loss": 0.7779, "num_input_tokens_seen": 12561808, "step": 21785 }, { "epoch": 3.245457253500149, "grad_norm": 0.22350066900253296, "learning_rate": 4.2706596678305405e-05, "loss": 0.7859, "num_input_tokens_seen": 12564912, "step": 21790 }, { "epoch": 3.246201966041108, "grad_norm": 0.32531023025512695, "learning_rate": 4.2702008236240424e-05, "loss": 0.8191, "num_input_tokens_seen": 12567760, "step": 21795 }, { "epoch": 3.2469466785820673, "grad_norm": 0.2531544268131256, "learning_rate": 4.269741859794568e-05, "loss": 0.7736, "num_input_tokens_seen": 12570512, "step": 21800 }, { "epoch": 3.2476913911230265, "grad_norm": 0.342607706785202, "learning_rate": 4.2692827763731356e-05, "loss": 0.8255, "num_input_tokens_seen": 12573360, "step": 21805 }, { "epoch": 3.2484361036639857, "grad_norm": 0.22648461163043976, "learning_rate": 4.268823573390766e-05, "loss": 0.7942, "num_input_tokens_seen": 12575984, "step": 21810 }, { "epoch": 3.249180816204945, "grad_norm": 0.3056231439113617, "learning_rate": 4.26836425087849e-05, "loss": 0.8486, "num_input_tokens_seen": 12578800, "step": 21815 }, { "epoch": 3.249925528745904, "grad_norm": 0.25880905985832214, "learning_rate": 4.267904808867349e-05, "loss": 0.7853, "num_input_tokens_seen": 12581424, "step": 21820 }, { "epoch": 3.2506702412868633, "grad_norm": 0.3350611925125122, "learning_rate": 4.267445247388389e-05, "loss": 0.7833, "num_input_tokens_seen": 12584464, "step": 21825 }, { "epoch": 3.2514149538278225, "grad_norm": 0.24014565348625183, "learning_rate": 4.2669855664726635e-05, "loss": 0.8297, "num_input_tokens_seen": 12587472, "step": 21830 }, { "epoch": 3.2521596663687817, "grad_norm": 0.2756428122520447, "learning_rate": 4.266525766151238e-05, "loss": 0.7996, "num_input_tokens_seen": 12590288, "step": 21835 }, { "epoch": 3.252904378909741, "grad_norm": 0.25913041830062866, "learning_rate": 4.266065846455184e-05, "loss": 0.8203, "num_input_tokens_seen": 12593296, "step": 21840 }, { "epoch": 3.2536490914507, "grad_norm": 0.2813302278518677, "learning_rate": 4.26560580741558e-05, "loss": 0.8002, "num_input_tokens_seen": 12596144, "step": 21845 }, { "epoch": 3.2543938039916593, "grad_norm": 0.20611998438835144, "learning_rate": 4.2651456490635144e-05, "loss": 0.784, "num_input_tokens_seen": 12599152, "step": 21850 }, { "epoch": 3.2551385165326185, "grad_norm": 0.22202648222446442, "learning_rate": 4.2646853714300816e-05, "loss": 0.8128, "num_input_tokens_seen": 12601904, "step": 21855 }, { "epoch": 3.2558832290735777, "grad_norm": 0.34706026315689087, "learning_rate": 4.264224974546387e-05, "loss": 0.8507, "num_input_tokens_seen": 12604880, "step": 21860 }, { "epoch": 3.256627941614537, "grad_norm": 0.27786245942115784, "learning_rate": 4.263764458443541e-05, "loss": 0.8329, "num_input_tokens_seen": 12607920, "step": 21865 }, { "epoch": 3.257372654155496, "grad_norm": 0.2692718803882599, "learning_rate": 4.263303823152663e-05, "loss": 0.784, "num_input_tokens_seen": 12610960, "step": 21870 }, { "epoch": 3.2581173666964554, "grad_norm": 0.2929164469242096, "learning_rate": 4.262843068704883e-05, "loss": 0.8261, "num_input_tokens_seen": 12613840, "step": 21875 }, { "epoch": 3.2588620792374146, "grad_norm": 0.32098689675331116, "learning_rate": 4.262382195131335e-05, "loss": 0.7936, "num_input_tokens_seen": 12616688, "step": 21880 }, { "epoch": 3.2596067917783733, "grad_norm": 0.2561972141265869, "learning_rate": 4.2619212024631636e-05, "loss": 0.828, "num_input_tokens_seen": 12619312, "step": 21885 }, { "epoch": 3.260351504319333, "grad_norm": 0.2875092625617981, "learning_rate": 4.261460090731521e-05, "loss": 0.7784, "num_input_tokens_seen": 12622192, "step": 21890 }, { "epoch": 3.2610962168602917, "grad_norm": 0.1839170902967453, "learning_rate": 4.2609988599675665e-05, "loss": 0.812, "num_input_tokens_seen": 12624912, "step": 21895 }, { "epoch": 3.2618409294012514, "grad_norm": 0.22387947142124176, "learning_rate": 4.2605375102024694e-05, "loss": 0.801, "num_input_tokens_seen": 12628592, "step": 21900 }, { "epoch": 3.26258564194221, "grad_norm": 0.34676307439804077, "learning_rate": 4.2600760414674044e-05, "loss": 0.7984, "num_input_tokens_seen": 12631632, "step": 21905 }, { "epoch": 3.2633303544831693, "grad_norm": 0.24170517921447754, "learning_rate": 4.259614453793557e-05, "loss": 0.8215, "num_input_tokens_seen": 12634672, "step": 21910 }, { "epoch": 3.2640750670241285, "grad_norm": 0.2886103093624115, "learning_rate": 4.25915274721212e-05, "loss": 0.7845, "num_input_tokens_seen": 12637520, "step": 21915 }, { "epoch": 3.2648197795650877, "grad_norm": 0.33290034532546997, "learning_rate": 4.258690921754291e-05, "loss": 0.8039, "num_input_tokens_seen": 12640144, "step": 21920 }, { "epoch": 3.265564492106047, "grad_norm": 0.30003276467323303, "learning_rate": 4.25822897745128e-05, "loss": 0.8611, "num_input_tokens_seen": 12643088, "step": 21925 }, { "epoch": 3.266309204647006, "grad_norm": 0.23175203800201416, "learning_rate": 4.257766914334303e-05, "loss": 0.8082, "num_input_tokens_seen": 12645904, "step": 21930 }, { "epoch": 3.2670539171879653, "grad_norm": 0.23773139715194702, "learning_rate": 4.257304732434585e-05, "loss": 0.8117, "num_input_tokens_seen": 12648784, "step": 21935 }, { "epoch": 3.2677986297289245, "grad_norm": 0.20823493599891663, "learning_rate": 4.256842431783358e-05, "loss": 0.826, "num_input_tokens_seen": 12651632, "step": 21940 }, { "epoch": 3.2685433422698837, "grad_norm": 0.21354030072689056, "learning_rate": 4.256380012411862e-05, "loss": 0.8235, "num_input_tokens_seen": 12654544, "step": 21945 }, { "epoch": 3.269288054810843, "grad_norm": 0.26840460300445557, "learning_rate": 4.255917474351345e-05, "loss": 0.7784, "num_input_tokens_seen": 12657392, "step": 21950 }, { "epoch": 3.270032767351802, "grad_norm": 0.24950926005840302, "learning_rate": 4.2554548176330655e-05, "loss": 0.7933, "num_input_tokens_seen": 12660400, "step": 21955 }, { "epoch": 3.2707774798927614, "grad_norm": 0.20576050877571106, "learning_rate": 4.254992042288286e-05, "loss": 0.8191, "num_input_tokens_seen": 12663440, "step": 21960 }, { "epoch": 3.2715221924337206, "grad_norm": 0.2774025499820709, "learning_rate": 4.254529148348279e-05, "loss": 0.7915, "num_input_tokens_seen": 12666416, "step": 21965 }, { "epoch": 3.2722669049746798, "grad_norm": 0.20980966091156006, "learning_rate": 4.254066135844326e-05, "loss": 0.8267, "num_input_tokens_seen": 12669168, "step": 21970 }, { "epoch": 3.273011617515639, "grad_norm": 0.2882102429866791, "learning_rate": 4.253603004807715e-05, "loss": 0.804, "num_input_tokens_seen": 12672368, "step": 21975 }, { "epoch": 3.273756330056598, "grad_norm": 0.2925436496734619, "learning_rate": 4.253139755269743e-05, "loss": 0.8057, "num_input_tokens_seen": 12674896, "step": 21980 }, { "epoch": 3.2745010425975574, "grad_norm": 0.28794100880622864, "learning_rate": 4.2526763872617137e-05, "loss": 0.8183, "num_input_tokens_seen": 12677968, "step": 21985 }, { "epoch": 3.2752457551385166, "grad_norm": 0.21251778304576874, "learning_rate": 4.2522129008149395e-05, "loss": 0.7994, "num_input_tokens_seen": 12680816, "step": 21990 }, { "epoch": 3.2759904676794758, "grad_norm": 0.2692641019821167, "learning_rate": 4.2517492959607426e-05, "loss": 0.82, "num_input_tokens_seen": 12683664, "step": 21995 }, { "epoch": 3.276735180220435, "grad_norm": 0.2950257956981659, "learning_rate": 4.251285572730449e-05, "loss": 0.7845, "num_input_tokens_seen": 12686608, "step": 22000 }, { "epoch": 3.277479892761394, "grad_norm": 0.29005739092826843, "learning_rate": 4.250821731155398e-05, "loss": 0.8058, "num_input_tokens_seen": 12689392, "step": 22005 }, { "epoch": 3.2782246053023534, "grad_norm": 0.27368852496147156, "learning_rate": 4.250357771266932e-05, "loss": 0.7989, "num_input_tokens_seen": 12692240, "step": 22010 }, { "epoch": 3.2789693178433126, "grad_norm": 0.27353760600090027, "learning_rate": 4.249893693096404e-05, "loss": 0.8075, "num_input_tokens_seen": 12695344, "step": 22015 }, { "epoch": 3.279714030384272, "grad_norm": 0.3561364710330963, "learning_rate": 4.249429496675175e-05, "loss": 0.8252, "num_input_tokens_seen": 12697968, "step": 22020 }, { "epoch": 3.280458742925231, "grad_norm": 0.32920730113983154, "learning_rate": 4.248965182034613e-05, "loss": 0.8, "num_input_tokens_seen": 12701200, "step": 22025 }, { "epoch": 3.28120345546619, "grad_norm": 0.24337519705295563, "learning_rate": 4.248500749206096e-05, "loss": 0.8264, "num_input_tokens_seen": 12704016, "step": 22030 }, { "epoch": 3.2819481680071494, "grad_norm": 0.2339179813861847, "learning_rate": 4.248036198221006e-05, "loss": 0.793, "num_input_tokens_seen": 12706640, "step": 22035 }, { "epoch": 3.2826928805481086, "grad_norm": 0.27217474579811096, "learning_rate": 4.2475715291107374e-05, "loss": 0.8333, "num_input_tokens_seen": 12709392, "step": 22040 }, { "epoch": 3.283437593089068, "grad_norm": 0.2956182360649109, "learning_rate": 4.24710674190669e-05, "loss": 0.8814, "num_input_tokens_seen": 12711952, "step": 22045 }, { "epoch": 3.284182305630027, "grad_norm": 0.24666012823581696, "learning_rate": 4.2466418366402715e-05, "loss": 0.8055, "num_input_tokens_seen": 12714896, "step": 22050 }, { "epoch": 3.284927018170986, "grad_norm": 0.24364963173866272, "learning_rate": 4.2461768133428993e-05, "loss": 0.8182, "num_input_tokens_seen": 12717680, "step": 22055 }, { "epoch": 3.285671730711945, "grad_norm": 0.2830907702445984, "learning_rate": 4.2457116720459975e-05, "loss": 0.8254, "num_input_tokens_seen": 12720656, "step": 22060 }, { "epoch": 3.2864164432529046, "grad_norm": 0.2631911337375641, "learning_rate": 4.245246412780999e-05, "loss": 0.8188, "num_input_tokens_seen": 12723728, "step": 22065 }, { "epoch": 3.2871611557938634, "grad_norm": 0.27335160970687866, "learning_rate": 4.244781035579343e-05, "loss": 0.7886, "num_input_tokens_seen": 12726256, "step": 22070 }, { "epoch": 3.2879058683348226, "grad_norm": 0.22942876815795898, "learning_rate": 4.244315540472478e-05, "loss": 0.8168, "num_input_tokens_seen": 12728912, "step": 22075 }, { "epoch": 3.2886505808757818, "grad_norm": 0.25651657581329346, "learning_rate": 4.243849927491861e-05, "loss": 0.8116, "num_input_tokens_seen": 12731888, "step": 22080 }, { "epoch": 3.289395293416741, "grad_norm": 0.4424954652786255, "learning_rate": 4.2433841966689564e-05, "loss": 0.8159, "num_input_tokens_seen": 12734608, "step": 22085 }, { "epoch": 3.2901400059577, "grad_norm": 0.2530811131000519, "learning_rate": 4.2429183480352354e-05, "loss": 0.8124, "num_input_tokens_seen": 12737424, "step": 22090 }, { "epoch": 3.2908847184986594, "grad_norm": 0.2338629961013794, "learning_rate": 4.242452381622179e-05, "loss": 0.8182, "num_input_tokens_seen": 12740464, "step": 22095 }, { "epoch": 3.2916294310396186, "grad_norm": 0.2475656419992447, "learning_rate": 4.2419862974612744e-05, "loss": 0.7837, "num_input_tokens_seen": 12743408, "step": 22100 }, { "epoch": 3.292374143580578, "grad_norm": 0.24965542554855347, "learning_rate": 4.2415200955840184e-05, "loss": 0.795, "num_input_tokens_seen": 12746000, "step": 22105 }, { "epoch": 3.293118856121537, "grad_norm": 0.29400405287742615, "learning_rate": 4.241053776021915e-05, "loss": 0.8077, "num_input_tokens_seen": 12749040, "step": 22110 }, { "epoch": 3.293863568662496, "grad_norm": 0.21160449087619781, "learning_rate": 4.240587338806476e-05, "loss": 0.8161, "num_input_tokens_seen": 12751920, "step": 22115 }, { "epoch": 3.2946082812034554, "grad_norm": 0.28147146105766296, "learning_rate": 4.2401207839692217e-05, "loss": 0.7873, "num_input_tokens_seen": 12754704, "step": 22120 }, { "epoch": 3.2953529937444146, "grad_norm": 0.2681068480014801, "learning_rate": 4.239654111541679e-05, "loss": 0.814, "num_input_tokens_seen": 12757872, "step": 22125 }, { "epoch": 3.296097706285374, "grad_norm": 0.2717007100582123, "learning_rate": 4.239187321555384e-05, "loss": 0.8149, "num_input_tokens_seen": 12760976, "step": 22130 }, { "epoch": 3.296842418826333, "grad_norm": 0.2763012945652008, "learning_rate": 4.2387204140418815e-05, "loss": 0.8045, "num_input_tokens_seen": 12763952, "step": 22135 }, { "epoch": 3.297587131367292, "grad_norm": 0.22404645383358002, "learning_rate": 4.238253389032723e-05, "loss": 0.8072, "num_input_tokens_seen": 12767056, "step": 22140 }, { "epoch": 3.2983318439082514, "grad_norm": 0.24579548835754395, "learning_rate": 4.237786246559467e-05, "loss": 0.8298, "num_input_tokens_seen": 12769744, "step": 22145 }, { "epoch": 3.2990765564492106, "grad_norm": 0.20919834077358246, "learning_rate": 4.2373189866536815e-05, "loss": 0.7972, "num_input_tokens_seen": 12772656, "step": 22150 }, { "epoch": 3.29982126899017, "grad_norm": 0.21361298859119415, "learning_rate": 4.236851609346943e-05, "loss": 0.8193, "num_input_tokens_seen": 12775472, "step": 22155 }, { "epoch": 3.300565981531129, "grad_norm": 0.21754969656467438, "learning_rate": 4.236384114670834e-05, "loss": 0.7843, "num_input_tokens_seen": 12778352, "step": 22160 }, { "epoch": 3.301310694072088, "grad_norm": 0.23085525631904602, "learning_rate": 4.2359165026569455e-05, "loss": 0.7801, "num_input_tokens_seen": 12781200, "step": 22165 }, { "epoch": 3.3020554066130474, "grad_norm": 0.2213839441537857, "learning_rate": 4.235448773336878e-05, "loss": 0.8057, "num_input_tokens_seen": 12783888, "step": 22170 }, { "epoch": 3.3028001191540066, "grad_norm": 0.24609768390655518, "learning_rate": 4.234980926742239e-05, "loss": 0.8035, "num_input_tokens_seen": 12786672, "step": 22175 }, { "epoch": 3.303544831694966, "grad_norm": 0.23476192355155945, "learning_rate": 4.2345129629046425e-05, "loss": 0.7759, "num_input_tokens_seen": 12789488, "step": 22180 }, { "epoch": 3.304289544235925, "grad_norm": 0.2434861809015274, "learning_rate": 4.234044881855711e-05, "loss": 0.8269, "num_input_tokens_seen": 12792240, "step": 22185 }, { "epoch": 3.3050342567768842, "grad_norm": 0.2842976748943329, "learning_rate": 4.233576683627078e-05, "loss": 0.7729, "num_input_tokens_seen": 12794864, "step": 22190 }, { "epoch": 3.3057789693178434, "grad_norm": 0.29072096943855286, "learning_rate": 4.23310836825038e-05, "loss": 0.8243, "num_input_tokens_seen": 12797712, "step": 22195 }, { "epoch": 3.3065236818588026, "grad_norm": 0.24240222573280334, "learning_rate": 4.2326399357572654e-05, "loss": 0.8054, "num_input_tokens_seen": 12800784, "step": 22200 }, { "epoch": 3.307268394399762, "grad_norm": 0.279392808675766, "learning_rate": 4.232171386179388e-05, "loss": 0.8167, "num_input_tokens_seen": 12803664, "step": 22205 }, { "epoch": 3.308013106940721, "grad_norm": 0.20353198051452637, "learning_rate": 4.231702719548411e-05, "loss": 0.8131, "num_input_tokens_seen": 12806192, "step": 22210 }, { "epoch": 3.3087578194816802, "grad_norm": 0.2559942603111267, "learning_rate": 4.231233935896004e-05, "loss": 0.8144, "num_input_tokens_seen": 12809488, "step": 22215 }, { "epoch": 3.3095025320226394, "grad_norm": 0.2319115698337555, "learning_rate": 4.2307650352538465e-05, "loss": 0.7872, "num_input_tokens_seen": 12812176, "step": 22220 }, { "epoch": 3.310247244563598, "grad_norm": 0.22443021833896637, "learning_rate": 4.230296017653625e-05, "loss": 0.7853, "num_input_tokens_seen": 12815024, "step": 22225 }, { "epoch": 3.310991957104558, "grad_norm": 0.2589344084262848, "learning_rate": 4.2298268831270335e-05, "loss": 0.7876, "num_input_tokens_seen": 12817776, "step": 22230 }, { "epoch": 3.3117366696455166, "grad_norm": 0.23483067750930786, "learning_rate": 4.229357631705774e-05, "loss": 0.7737, "num_input_tokens_seen": 12820400, "step": 22235 }, { "epoch": 3.3124813821864763, "grad_norm": 0.22629185020923615, "learning_rate": 4.228888263421557e-05, "loss": 0.8056, "num_input_tokens_seen": 12823472, "step": 22240 }, { "epoch": 3.313226094727435, "grad_norm": 0.21800099313259125, "learning_rate": 4.2284187783061e-05, "loss": 0.7805, "num_input_tokens_seen": 12826032, "step": 22245 }, { "epoch": 3.313970807268394, "grad_norm": 0.19908921420574188, "learning_rate": 4.22794917639113e-05, "loss": 0.8126, "num_input_tokens_seen": 12828848, "step": 22250 }, { "epoch": 3.3147155198093534, "grad_norm": 0.2480025738477707, "learning_rate": 4.227479457708379e-05, "loss": 0.8213, "num_input_tokens_seen": 12831728, "step": 22255 }, { "epoch": 3.3154602323503126, "grad_norm": 0.3365470767021179, "learning_rate": 4.22700962228959e-05, "loss": 0.7791, "num_input_tokens_seen": 12834576, "step": 22260 }, { "epoch": 3.316204944891272, "grad_norm": 0.25759491324424744, "learning_rate": 4.2265396701665125e-05, "loss": 0.7849, "num_input_tokens_seen": 12837232, "step": 22265 }, { "epoch": 3.316949657432231, "grad_norm": 0.25142964720726013, "learning_rate": 4.226069601370904e-05, "loss": 0.8228, "num_input_tokens_seen": 12840016, "step": 22270 }, { "epoch": 3.3176943699731902, "grad_norm": 0.25604888796806335, "learning_rate": 4.225599415934529e-05, "loss": 0.8012, "num_input_tokens_seen": 12843024, "step": 22275 }, { "epoch": 3.3184390825141494, "grad_norm": 0.2282949686050415, "learning_rate": 4.225129113889161e-05, "loss": 0.8015, "num_input_tokens_seen": 12845872, "step": 22280 }, { "epoch": 3.3191837950551086, "grad_norm": 0.25319811701774597, "learning_rate": 4.224658695266582e-05, "loss": 0.8289, "num_input_tokens_seen": 12848720, "step": 22285 }, { "epoch": 3.319928507596068, "grad_norm": 0.34788304567337036, "learning_rate": 4.22418816009858e-05, "loss": 0.8108, "num_input_tokens_seen": 12851600, "step": 22290 }, { "epoch": 3.320673220137027, "grad_norm": 0.22935891151428223, "learning_rate": 4.223717508416952e-05, "loss": 0.7772, "num_input_tokens_seen": 12854224, "step": 22295 }, { "epoch": 3.3214179326779862, "grad_norm": 0.3362482488155365, "learning_rate": 4.2232467402535036e-05, "loss": 0.7956, "num_input_tokens_seen": 12857232, "step": 22300 }, { "epoch": 3.3221626452189454, "grad_norm": 0.3211548924446106, "learning_rate": 4.222775855640047e-05, "loss": 0.8627, "num_input_tokens_seen": 12860016, "step": 22305 }, { "epoch": 3.3229073577599046, "grad_norm": 0.2961863875389099, "learning_rate": 4.222304854608401e-05, "loss": 0.8059, "num_input_tokens_seen": 12863120, "step": 22310 }, { "epoch": 3.323652070300864, "grad_norm": 0.28586021065711975, "learning_rate": 4.221833737190396e-05, "loss": 0.7852, "num_input_tokens_seen": 12865808, "step": 22315 }, { "epoch": 3.324396782841823, "grad_norm": 0.27344927191734314, "learning_rate": 4.2213625034178674e-05, "loss": 0.7589, "num_input_tokens_seen": 12868848, "step": 22320 }, { "epoch": 3.3251414953827823, "grad_norm": 0.26106369495391846, "learning_rate": 4.220891153322659e-05, "loss": 0.8012, "num_input_tokens_seen": 12871312, "step": 22325 }, { "epoch": 3.3258862079237415, "grad_norm": 0.36495745182037354, "learning_rate": 4.220419686936623e-05, "loss": 0.8302, "num_input_tokens_seen": 12874384, "step": 22330 }, { "epoch": 3.3266309204647007, "grad_norm": 0.3677709996700287, "learning_rate": 4.21994810429162e-05, "loss": 0.815, "num_input_tokens_seen": 12876976, "step": 22335 }, { "epoch": 3.32737563300566, "grad_norm": 0.17393387854099274, "learning_rate": 4.2194764054195166e-05, "loss": 0.7985, "num_input_tokens_seen": 12879600, "step": 22340 }, { "epoch": 3.328120345546619, "grad_norm": 0.31836462020874023, "learning_rate": 4.219004590352189e-05, "loss": 0.8341, "num_input_tokens_seen": 12882672, "step": 22345 }, { "epoch": 3.3288650580875783, "grad_norm": 0.2644028663635254, "learning_rate": 4.2185326591215196e-05, "loss": 0.8064, "num_input_tokens_seen": 12885392, "step": 22350 }, { "epoch": 3.3296097706285375, "grad_norm": 0.2564474642276764, "learning_rate": 4.2180606117594e-05, "loss": 0.7984, "num_input_tokens_seen": 12888240, "step": 22355 }, { "epoch": 3.3303544831694967, "grad_norm": 0.26497557759284973, "learning_rate": 4.21758844829773e-05, "loss": 0.8067, "num_input_tokens_seen": 12890896, "step": 22360 }, { "epoch": 3.331099195710456, "grad_norm": 0.34923577308654785, "learning_rate": 4.2171161687684156e-05, "loss": 0.8089, "num_input_tokens_seen": 12893840, "step": 22365 }, { "epoch": 3.331843908251415, "grad_norm": 0.23922108113765717, "learning_rate": 4.216643773203372e-05, "loss": 0.7952, "num_input_tokens_seen": 12896688, "step": 22370 }, { "epoch": 3.3325886207923743, "grad_norm": 0.22178928554058075, "learning_rate": 4.216171261634521e-05, "loss": 0.781, "num_input_tokens_seen": 12899824, "step": 22375 }, { "epoch": 3.3333333333333335, "grad_norm": 0.2701725363731384, "learning_rate": 4.215698634093794e-05, "loss": 0.8442, "num_input_tokens_seen": 12902864, "step": 22380 }, { "epoch": 3.3340780458742927, "grad_norm": 0.23983284831047058, "learning_rate": 4.2152258906131295e-05, "loss": 0.79, "num_input_tokens_seen": 12905616, "step": 22385 }, { "epoch": 3.334822758415252, "grad_norm": 0.3247852623462677, "learning_rate": 4.214753031224472e-05, "loss": 0.7864, "num_input_tokens_seen": 12908816, "step": 22390 }, { "epoch": 3.335567470956211, "grad_norm": 0.22962084412574768, "learning_rate": 4.2142800559597764e-05, "loss": 0.8126, "num_input_tokens_seen": 12911536, "step": 22395 }, { "epoch": 3.33631218349717, "grad_norm": 0.22856742143630981, "learning_rate": 4.2138069648510045e-05, "loss": 0.8121, "num_input_tokens_seen": 12914384, "step": 22400 }, { "epoch": 3.3370568960381295, "grad_norm": 0.28045928478240967, "learning_rate": 4.2133337579301255e-05, "loss": 0.818, "num_input_tokens_seen": 12917136, "step": 22405 }, { "epoch": 3.3378016085790883, "grad_norm": 0.22151117026805878, "learning_rate": 4.212860435229117e-05, "loss": 0.8078, "num_input_tokens_seen": 12919824, "step": 22410 }, { "epoch": 3.338546321120048, "grad_norm": 0.2147866040468216, "learning_rate": 4.212386996779965e-05, "loss": 0.8126, "num_input_tokens_seen": 12922576, "step": 22415 }, { "epoch": 3.3392910336610067, "grad_norm": 0.22126901149749756, "learning_rate": 4.2119134426146614e-05, "loss": 0.8752, "num_input_tokens_seen": 12925648, "step": 22420 }, { "epoch": 3.340035746201966, "grad_norm": 0.26987090706825256, "learning_rate": 4.211439772765208e-05, "loss": 0.8057, "num_input_tokens_seen": 12928272, "step": 22425 }, { "epoch": 3.340780458742925, "grad_norm": 0.23492342233657837, "learning_rate": 4.210965987263612e-05, "loss": 0.8179, "num_input_tokens_seen": 12930960, "step": 22430 }, { "epoch": 3.3415251712838843, "grad_norm": 0.32465922832489014, "learning_rate": 4.2104920861418906e-05, "loss": 0.8051, "num_input_tokens_seen": 12933808, "step": 22435 }, { "epoch": 3.3422698838248435, "grad_norm": 0.2593085467815399, "learning_rate": 4.21001806943207e-05, "loss": 0.7956, "num_input_tokens_seen": 12936816, "step": 22440 }, { "epoch": 3.3430145963658027, "grad_norm": 0.29239341616630554, "learning_rate": 4.209543937166179e-05, "loss": 0.8018, "num_input_tokens_seen": 12939728, "step": 22445 }, { "epoch": 3.343759308906762, "grad_norm": 0.18263743817806244, "learning_rate": 4.2090696893762605e-05, "loss": 0.804, "num_input_tokens_seen": 12942544, "step": 22450 }, { "epoch": 3.344504021447721, "grad_norm": 0.21323487162590027, "learning_rate": 4.20859532609436e-05, "loss": 0.7899, "num_input_tokens_seen": 12945328, "step": 22455 }, { "epoch": 3.3452487339886803, "grad_norm": 0.24269282817840576, "learning_rate": 4.208120847352535e-05, "loss": 0.819, "num_input_tokens_seen": 12947824, "step": 22460 }, { "epoch": 3.3459934465296395, "grad_norm": 0.22338399291038513, "learning_rate": 4.207646253182847e-05, "loss": 0.7865, "num_input_tokens_seen": 12951120, "step": 22465 }, { "epoch": 3.3467381590705987, "grad_norm": 0.2981334924697876, "learning_rate": 4.207171543617369e-05, "loss": 0.8119, "num_input_tokens_seen": 12953904, "step": 22470 }, { "epoch": 3.347482871611558, "grad_norm": 0.22814011573791504, "learning_rate": 4.206696718688178e-05, "loss": 0.7896, "num_input_tokens_seen": 12956752, "step": 22475 }, { "epoch": 3.348227584152517, "grad_norm": 0.26190677285194397, "learning_rate": 4.206221778427362e-05, "loss": 0.7797, "num_input_tokens_seen": 12959920, "step": 22480 }, { "epoch": 3.3489722966934763, "grad_norm": 0.29746076464653015, "learning_rate": 4.205746722867014e-05, "loss": 0.8053, "num_input_tokens_seen": 12962704, "step": 22485 }, { "epoch": 3.3497170092344355, "grad_norm": 0.24662254750728607, "learning_rate": 4.2052715520392397e-05, "loss": 0.7686, "num_input_tokens_seen": 12965712, "step": 22490 }, { "epoch": 3.3504617217753947, "grad_norm": 0.29240912199020386, "learning_rate": 4.2047962659761454e-05, "loss": 0.7607, "num_input_tokens_seen": 12968592, "step": 22495 }, { "epoch": 3.351206434316354, "grad_norm": 0.236784428358078, "learning_rate": 4.204320864709852e-05, "loss": 0.7558, "num_input_tokens_seen": 12971664, "step": 22500 }, { "epoch": 3.351951146857313, "grad_norm": 0.3373023569583893, "learning_rate": 4.203845348272483e-05, "loss": 0.8129, "num_input_tokens_seen": 12974448, "step": 22505 }, { "epoch": 3.3526958593982723, "grad_norm": 0.3023706078529358, "learning_rate": 4.2033697166961716e-05, "loss": 0.77, "num_input_tokens_seen": 12977424, "step": 22510 }, { "epoch": 3.3534405719392315, "grad_norm": 0.2636207640171051, "learning_rate": 4.202893970013062e-05, "loss": 0.7559, "num_input_tokens_seen": 12980272, "step": 22515 }, { "epoch": 3.3541852844801907, "grad_norm": 0.26460984349250793, "learning_rate": 4.202418108255301e-05, "loss": 0.792, "num_input_tokens_seen": 12983184, "step": 22520 }, { "epoch": 3.35492999702115, "grad_norm": 0.2276412695646286, "learning_rate": 4.201942131455045e-05, "loss": 0.7845, "num_input_tokens_seen": 12986096, "step": 22525 }, { "epoch": 3.355674709562109, "grad_norm": 0.3259781002998352, "learning_rate": 4.2014660396444596e-05, "loss": 0.7878, "num_input_tokens_seen": 12989072, "step": 22530 }, { "epoch": 3.3564194221030683, "grad_norm": 0.2610168159008026, "learning_rate": 4.200989832855717e-05, "loss": 0.7879, "num_input_tokens_seen": 12991984, "step": 22535 }, { "epoch": 3.3571641346440275, "grad_norm": 0.31111428141593933, "learning_rate": 4.2005135111209976e-05, "loss": 0.8073, "num_input_tokens_seen": 12994704, "step": 22540 }, { "epoch": 3.3579088471849867, "grad_norm": 0.24333375692367554, "learning_rate": 4.200037074472488e-05, "loss": 0.8172, "num_input_tokens_seen": 12997616, "step": 22545 }, { "epoch": 3.358653559725946, "grad_norm": 0.26570001244544983, "learning_rate": 4.1995605229423856e-05, "loss": 0.7765, "num_input_tokens_seen": 13000336, "step": 22550 }, { "epoch": 3.359398272266905, "grad_norm": 0.22768773138523102, "learning_rate": 4.199083856562893e-05, "loss": 0.7928, "num_input_tokens_seen": 13003024, "step": 22555 }, { "epoch": 3.3601429848078643, "grad_norm": 0.34146028757095337, "learning_rate": 4.198607075366221e-05, "loss": 0.8382, "num_input_tokens_seen": 13005968, "step": 22560 }, { "epoch": 3.3608876973488235, "grad_norm": 0.2908654808998108, "learning_rate": 4.198130179384589e-05, "loss": 0.8384, "num_input_tokens_seen": 13008656, "step": 22565 }, { "epoch": 3.3616324098897827, "grad_norm": 0.2619028687477112, "learning_rate": 4.197653168650223e-05, "loss": 0.819, "num_input_tokens_seen": 13011696, "step": 22570 }, { "epoch": 3.3623771224307415, "grad_norm": 0.27743858098983765, "learning_rate": 4.197176043195359e-05, "loss": 0.8128, "num_input_tokens_seen": 13014448, "step": 22575 }, { "epoch": 3.363121834971701, "grad_norm": 0.3857404291629791, "learning_rate": 4.196698803052237e-05, "loss": 0.769, "num_input_tokens_seen": 13017552, "step": 22580 }, { "epoch": 3.36386654751266, "grad_norm": 0.23603446781635284, "learning_rate": 4.196221448253109e-05, "loss": 0.8049, "num_input_tokens_seen": 13020720, "step": 22585 }, { "epoch": 3.3646112600536195, "grad_norm": 0.17002177238464355, "learning_rate": 4.1957439788302325e-05, "loss": 0.8597, "num_input_tokens_seen": 13023344, "step": 22590 }, { "epoch": 3.3653559725945783, "grad_norm": 0.2601887881755829, "learning_rate": 4.195266394815871e-05, "loss": 0.7547, "num_input_tokens_seen": 13026672, "step": 22595 }, { "epoch": 3.3661006851355375, "grad_norm": 0.2956496477127075, "learning_rate": 4.1947886962423e-05, "loss": 0.7801, "num_input_tokens_seen": 13029744, "step": 22600 }, { "epoch": 3.3668453976764967, "grad_norm": 0.26727864146232605, "learning_rate": 4.1943108831417987e-05, "loss": 0.7999, "num_input_tokens_seen": 13032720, "step": 22605 }, { "epoch": 3.367590110217456, "grad_norm": 0.2738741934299469, "learning_rate": 4.193832955546657e-05, "loss": 0.7938, "num_input_tokens_seen": 13036080, "step": 22610 }, { "epoch": 3.368334822758415, "grad_norm": 0.23302678763866425, "learning_rate": 4.1933549134891706e-05, "loss": 0.8424, "num_input_tokens_seen": 13038832, "step": 22615 }, { "epoch": 3.3690795352993743, "grad_norm": 0.23050962388515472, "learning_rate": 4.192876757001643e-05, "loss": 0.789, "num_input_tokens_seen": 13041616, "step": 22620 }, { "epoch": 3.3698242478403335, "grad_norm": 0.17694859206676483, "learning_rate": 4.1923984861163886e-05, "loss": 0.8139, "num_input_tokens_seen": 13044592, "step": 22625 }, { "epoch": 3.3705689603812927, "grad_norm": 0.2498641014099121, "learning_rate": 4.191920100865724e-05, "loss": 0.8185, "num_input_tokens_seen": 13047248, "step": 22630 }, { "epoch": 3.371313672922252, "grad_norm": 0.29449933767318726, "learning_rate": 4.191441601281978e-05, "loss": 0.8228, "num_input_tokens_seen": 13050096, "step": 22635 }, { "epoch": 3.372058385463211, "grad_norm": 0.22908274829387665, "learning_rate": 4.1909629873974865e-05, "loss": 0.8191, "num_input_tokens_seen": 13053040, "step": 22640 }, { "epoch": 3.3728030980041703, "grad_norm": 0.3294723629951477, "learning_rate": 4.1904842592445906e-05, "loss": 0.8403, "num_input_tokens_seen": 13055920, "step": 22645 }, { "epoch": 3.3735478105451295, "grad_norm": 0.2015332579612732, "learning_rate": 4.190005416855641e-05, "loss": 0.7837, "num_input_tokens_seen": 13058672, "step": 22650 }, { "epoch": 3.3742925230860887, "grad_norm": 0.2762283384799957, "learning_rate": 4.1895264602629966e-05, "loss": 0.8251, "num_input_tokens_seen": 13061776, "step": 22655 }, { "epoch": 3.375037235627048, "grad_norm": 0.2682175636291504, "learning_rate": 4.189047389499023e-05, "loss": 0.8183, "num_input_tokens_seen": 13065040, "step": 22660 }, { "epoch": 3.375781948168007, "grad_norm": 0.28056636452674866, "learning_rate": 4.1885682045960945e-05, "loss": 0.8032, "num_input_tokens_seen": 13067984, "step": 22665 }, { "epoch": 3.3765266607089663, "grad_norm": 0.3041004538536072, "learning_rate": 4.188088905586591e-05, "loss": 0.8261, "num_input_tokens_seen": 13070800, "step": 22670 }, { "epoch": 3.3772713732499255, "grad_norm": 0.3960741460323334, "learning_rate": 4.1876094925029036e-05, "loss": 0.8366, "num_input_tokens_seen": 13073712, "step": 22675 }, { "epoch": 3.3780160857908847, "grad_norm": 0.33880022168159485, "learning_rate": 4.187129965377427e-05, "loss": 0.8086, "num_input_tokens_seen": 13076400, "step": 22680 }, { "epoch": 3.378760798331844, "grad_norm": 0.287634938955307, "learning_rate": 4.186650324242568e-05, "loss": 0.7848, "num_input_tokens_seen": 13079536, "step": 22685 }, { "epoch": 3.379505510872803, "grad_norm": 0.31132668256759644, "learning_rate": 4.186170569130737e-05, "loss": 0.8345, "num_input_tokens_seen": 13082416, "step": 22690 }, { "epoch": 3.3802502234137624, "grad_norm": 0.32020121812820435, "learning_rate": 4.185690700074354e-05, "loss": 0.8196, "num_input_tokens_seen": 13085008, "step": 22695 }, { "epoch": 3.3809949359547216, "grad_norm": 0.26591503620147705, "learning_rate": 4.185210717105848e-05, "loss": 0.7949, "num_input_tokens_seen": 13087824, "step": 22700 }, { "epoch": 3.3817396484956808, "grad_norm": 0.30564025044441223, "learning_rate": 4.184730620257652e-05, "loss": 0.7888, "num_input_tokens_seen": 13090416, "step": 22705 }, { "epoch": 3.38248436103664, "grad_norm": 0.35073384642601013, "learning_rate": 4.18425040956221e-05, "loss": 0.8248, "num_input_tokens_seen": 13093968, "step": 22710 }, { "epoch": 3.383229073577599, "grad_norm": 0.28661465644836426, "learning_rate": 4.183770085051974e-05, "loss": 0.8072, "num_input_tokens_seen": 13096752, "step": 22715 }, { "epoch": 3.3839737861185584, "grad_norm": 0.2550511062145233, "learning_rate": 4.183289646759402e-05, "loss": 0.809, "num_input_tokens_seen": 13099536, "step": 22720 }, { "epoch": 3.3847184986595176, "grad_norm": 0.268213152885437, "learning_rate": 4.182809094716958e-05, "loss": 0.8394, "num_input_tokens_seen": 13102128, "step": 22725 }, { "epoch": 3.3854632112004768, "grad_norm": 0.30480024218559265, "learning_rate": 4.182328428957118e-05, "loss": 0.7933, "num_input_tokens_seen": 13105168, "step": 22730 }, { "epoch": 3.386207923741436, "grad_norm": 0.28666749596595764, "learning_rate": 4.181847649512362e-05, "loss": 0.8151, "num_input_tokens_seen": 13108272, "step": 22735 }, { "epoch": 3.386952636282395, "grad_norm": 0.35810595750808716, "learning_rate": 4.181366756415181e-05, "loss": 0.8076, "num_input_tokens_seen": 13111184, "step": 22740 }, { "epoch": 3.3876973488233544, "grad_norm": 0.20672401785850525, "learning_rate": 4.18088574969807e-05, "loss": 0.825, "num_input_tokens_seen": 13114000, "step": 22745 }, { "epoch": 3.388442061364313, "grad_norm": 0.3252500891685486, "learning_rate": 4.1804046293935334e-05, "loss": 0.8308, "num_input_tokens_seen": 13116944, "step": 22750 }, { "epoch": 3.389186773905273, "grad_norm": 0.19847989082336426, "learning_rate": 4.179923395534084e-05, "loss": 0.814, "num_input_tokens_seen": 13119856, "step": 22755 }, { "epoch": 3.3899314864462315, "grad_norm": 0.29339101910591125, "learning_rate": 4.1794420481522424e-05, "loss": 0.8247, "num_input_tokens_seen": 13123056, "step": 22760 }, { "epoch": 3.390676198987191, "grad_norm": 0.5124849081039429, "learning_rate": 4.178960587280535e-05, "loss": 0.8411, "num_input_tokens_seen": 13126032, "step": 22765 }, { "epoch": 3.39142091152815, "grad_norm": 0.2978195548057556, "learning_rate": 4.178479012951497e-05, "loss": 0.8252, "num_input_tokens_seen": 13129200, "step": 22770 }, { "epoch": 3.392165624069109, "grad_norm": 0.2663518488407135, "learning_rate": 4.177997325197671e-05, "loss": 0.779, "num_input_tokens_seen": 13131920, "step": 22775 }, { "epoch": 3.3929103366100684, "grad_norm": 0.20987989008426666, "learning_rate": 4.177515524051609e-05, "loss": 0.8248, "num_input_tokens_seen": 13134704, "step": 22780 }, { "epoch": 3.3936550491510276, "grad_norm": 0.22616420686244965, "learning_rate": 4.1770336095458676e-05, "loss": 0.8233, "num_input_tokens_seen": 13137424, "step": 22785 }, { "epoch": 3.3943997616919868, "grad_norm": 0.27745887637138367, "learning_rate": 4.176551581713013e-05, "loss": 0.8013, "num_input_tokens_seen": 13140336, "step": 22790 }, { "epoch": 3.395144474232946, "grad_norm": 0.29888468980789185, "learning_rate": 4.1760694405856194e-05, "loss": 0.8164, "num_input_tokens_seen": 13142960, "step": 22795 }, { "epoch": 3.395889186773905, "grad_norm": 0.22492681443691254, "learning_rate": 4.1755871861962674e-05, "loss": 0.8163, "num_input_tokens_seen": 13145872, "step": 22800 }, { "epoch": 3.3966338993148644, "grad_norm": 0.2432076781988144, "learning_rate": 4.175104818577545e-05, "loss": 0.8105, "num_input_tokens_seen": 13148880, "step": 22805 }, { "epoch": 3.3973786118558236, "grad_norm": 0.2516392767429352, "learning_rate": 4.174622337762051e-05, "loss": 0.8075, "num_input_tokens_seen": 13151856, "step": 22810 }, { "epoch": 3.3981233243967828, "grad_norm": 0.2762080132961273, "learning_rate": 4.174139743782387e-05, "loss": 0.8212, "num_input_tokens_seen": 13155088, "step": 22815 }, { "epoch": 3.398868036937742, "grad_norm": 0.19874674081802368, "learning_rate": 4.173657036671166e-05, "loss": 0.7967, "num_input_tokens_seen": 13157872, "step": 22820 }, { "epoch": 3.399612749478701, "grad_norm": 0.3571130633354187, "learning_rate": 4.173174216461006e-05, "loss": 0.8086, "num_input_tokens_seen": 13160912, "step": 22825 }, { "epoch": 3.4003574620196604, "grad_norm": 0.29543524980545044, "learning_rate": 4.172691283184536e-05, "loss": 0.8038, "num_input_tokens_seen": 13163888, "step": 22830 }, { "epoch": 3.4011021745606196, "grad_norm": 0.31652578711509705, "learning_rate": 4.172208236874389e-05, "loss": 0.8003, "num_input_tokens_seen": 13166608, "step": 22835 }, { "epoch": 3.401846887101579, "grad_norm": 0.19586066901683807, "learning_rate": 4.1717250775632086e-05, "loss": 0.7927, "num_input_tokens_seen": 13169424, "step": 22840 }, { "epoch": 3.402591599642538, "grad_norm": 0.27944415807724, "learning_rate": 4.1712418052836445e-05, "loss": 0.8194, "num_input_tokens_seen": 13172336, "step": 22845 }, { "epoch": 3.403336312183497, "grad_norm": 0.4021327495574951, "learning_rate": 4.1707584200683535e-05, "loss": 0.7818, "num_input_tokens_seen": 13175440, "step": 22850 }, { "epoch": 3.4040810247244564, "grad_norm": 0.25277000665664673, "learning_rate": 4.170274921950001e-05, "loss": 0.8055, "num_input_tokens_seen": 13178352, "step": 22855 }, { "epoch": 3.4048257372654156, "grad_norm": 0.37045374512672424, "learning_rate": 4.169791310961261e-05, "loss": 0.7896, "num_input_tokens_seen": 13181456, "step": 22860 }, { "epoch": 3.405570449806375, "grad_norm": 0.29853302240371704, "learning_rate": 4.169307587134813e-05, "loss": 0.8117, "num_input_tokens_seen": 13184080, "step": 22865 }, { "epoch": 3.406315162347334, "grad_norm": 0.2017640769481659, "learning_rate": 4.1688237505033454e-05, "loss": 0.8074, "num_input_tokens_seen": 13186736, "step": 22870 }, { "epoch": 3.407059874888293, "grad_norm": 0.23290018737316132, "learning_rate": 4.168339801099552e-05, "loss": 0.8267, "num_input_tokens_seen": 13189872, "step": 22875 }, { "epoch": 3.4078045874292524, "grad_norm": 0.2850748896598816, "learning_rate": 4.167855738956139e-05, "loss": 0.8002, "num_input_tokens_seen": 13192720, "step": 22880 }, { "epoch": 3.4085492999702116, "grad_norm": 0.25534743070602417, "learning_rate": 4.1673715641058165e-05, "loss": 0.8062, "num_input_tokens_seen": 13196048, "step": 22885 }, { "epoch": 3.409294012511171, "grad_norm": 0.27455711364746094, "learning_rate": 4.1668872765813025e-05, "loss": 0.7767, "num_input_tokens_seen": 13198736, "step": 22890 }, { "epoch": 3.41003872505213, "grad_norm": 0.2725590765476227, "learning_rate": 4.166402876415323e-05, "loss": 0.8025, "num_input_tokens_seen": 13201776, "step": 22895 }, { "epoch": 3.410783437593089, "grad_norm": 0.3133699893951416, "learning_rate": 4.1659183636406126e-05, "loss": 0.8098, "num_input_tokens_seen": 13204624, "step": 22900 }, { "epoch": 3.4115281501340484, "grad_norm": 0.2341616302728653, "learning_rate": 4.165433738289912e-05, "loss": 0.7541, "num_input_tokens_seen": 13207696, "step": 22905 }, { "epoch": 3.4122728626750076, "grad_norm": 0.28678396344184875, "learning_rate": 4.164949000395971e-05, "loss": 0.8016, "num_input_tokens_seen": 13210448, "step": 22910 }, { "epoch": 3.413017575215967, "grad_norm": 0.24365225434303284, "learning_rate": 4.1644641499915454e-05, "loss": 0.8016, "num_input_tokens_seen": 13213264, "step": 22915 }, { "epoch": 3.413762287756926, "grad_norm": 0.3349873721599579, "learning_rate": 4.1639791871094e-05, "loss": 0.8153, "num_input_tokens_seen": 13215952, "step": 22920 }, { "epoch": 3.414507000297885, "grad_norm": 0.24603283405303955, "learning_rate": 4.1634941117823065e-05, "loss": 0.7906, "num_input_tokens_seen": 13218512, "step": 22925 }, { "epoch": 3.4152517128388444, "grad_norm": 0.18752063810825348, "learning_rate": 4.1630089240430434e-05, "loss": 0.7802, "num_input_tokens_seen": 13221584, "step": 22930 }, { "epoch": 3.415996425379803, "grad_norm": 0.25732725858688354, "learning_rate": 4.162523623924399e-05, "loss": 0.7939, "num_input_tokens_seen": 13224368, "step": 22935 }, { "epoch": 3.4167411379207624, "grad_norm": 0.1758674681186676, "learning_rate": 4.162038211459167e-05, "loss": 0.7972, "num_input_tokens_seen": 13226928, "step": 22940 }, { "epoch": 3.4174858504617216, "grad_norm": 0.2918959856033325, "learning_rate": 4.161552686680151e-05, "loss": 0.78, "num_input_tokens_seen": 13229776, "step": 22945 }, { "epoch": 3.418230563002681, "grad_norm": 0.2701438069343567, "learning_rate": 4.161067049620159e-05, "loss": 0.7627, "num_input_tokens_seen": 13232624, "step": 22950 }, { "epoch": 3.41897527554364, "grad_norm": 0.33210980892181396, "learning_rate": 4.16058130031201e-05, "loss": 0.8214, "num_input_tokens_seen": 13235312, "step": 22955 }, { "epoch": 3.419719988084599, "grad_norm": 0.26555246114730835, "learning_rate": 4.160095438788527e-05, "loss": 0.8252, "num_input_tokens_seen": 13238000, "step": 22960 }, { "epoch": 3.4204647006255584, "grad_norm": 0.3285427391529083, "learning_rate": 4.1596094650825446e-05, "loss": 0.7994, "num_input_tokens_seen": 13240912, "step": 22965 }, { "epoch": 3.4212094131665176, "grad_norm": 0.2849109470844269, "learning_rate": 4.159123379226902e-05, "loss": 0.8029, "num_input_tokens_seen": 13243696, "step": 22970 }, { "epoch": 3.421954125707477, "grad_norm": 0.2711234986782074, "learning_rate": 4.158637181254447e-05, "loss": 0.7594, "num_input_tokens_seen": 13246992, "step": 22975 }, { "epoch": 3.422698838248436, "grad_norm": 0.2719402015209198, "learning_rate": 4.158150871198034e-05, "loss": 0.772, "num_input_tokens_seen": 13249808, "step": 22980 }, { "epoch": 3.423443550789395, "grad_norm": 0.3068862855434418, "learning_rate": 4.157664449090527e-05, "loss": 0.7641, "num_input_tokens_seen": 13253040, "step": 22985 }, { "epoch": 3.4241882633303544, "grad_norm": 0.371656209230423, "learning_rate": 4.1571779149647964e-05, "loss": 0.8153, "num_input_tokens_seen": 13256240, "step": 22990 }, { "epoch": 3.4249329758713136, "grad_norm": 0.24443063139915466, "learning_rate": 4.1566912688537195e-05, "loss": 0.8002, "num_input_tokens_seen": 13258928, "step": 22995 }, { "epoch": 3.425677688412273, "grad_norm": 0.257968932390213, "learning_rate": 4.156204510790183e-05, "loss": 0.7912, "num_input_tokens_seen": 13261712, "step": 23000 }, { "epoch": 3.426422400953232, "grad_norm": 0.26341837644577026, "learning_rate": 4.1557176408070784e-05, "loss": 0.8183, "num_input_tokens_seen": 13264400, "step": 23005 }, { "epoch": 3.4271671134941912, "grad_norm": 0.26132476329803467, "learning_rate": 4.155230658937308e-05, "loss": 0.8268, "num_input_tokens_seen": 13267440, "step": 23010 }, { "epoch": 3.4279118260351504, "grad_norm": 0.22444947063922882, "learning_rate": 4.154743565213779e-05, "loss": 0.8077, "num_input_tokens_seen": 13270544, "step": 23015 }, { "epoch": 3.4286565385761096, "grad_norm": 0.28229954838752747, "learning_rate": 4.154256359669408e-05, "loss": 0.8069, "num_input_tokens_seen": 13273456, "step": 23020 }, { "epoch": 3.429401251117069, "grad_norm": 0.2663065791130066, "learning_rate": 4.153769042337118e-05, "loss": 0.8575, "num_input_tokens_seen": 13276496, "step": 23025 }, { "epoch": 3.430145963658028, "grad_norm": 0.27256688475608826, "learning_rate": 4.153281613249839e-05, "loss": 0.8101, "num_input_tokens_seen": 13279152, "step": 23030 }, { "epoch": 3.4308906761989872, "grad_norm": 0.2026795893907547, "learning_rate": 4.152794072440511e-05, "loss": 0.7658, "num_input_tokens_seen": 13282192, "step": 23035 }, { "epoch": 3.4316353887399464, "grad_norm": 0.24685075879096985, "learning_rate": 4.1523064199420786e-05, "loss": 0.808, "num_input_tokens_seen": 13285136, "step": 23040 }, { "epoch": 3.4323801012809056, "grad_norm": 0.3272766172885895, "learning_rate": 4.1518186557874974e-05, "loss": 0.8088, "num_input_tokens_seen": 13288176, "step": 23045 }, { "epoch": 3.433124813821865, "grad_norm": 0.25283345580101013, "learning_rate": 4.151330780009726e-05, "loss": 0.795, "num_input_tokens_seen": 13290864, "step": 23050 }, { "epoch": 3.433869526362824, "grad_norm": 0.4618271589279175, "learning_rate": 4.150842792641735e-05, "loss": 0.8167, "num_input_tokens_seen": 13293968, "step": 23055 }, { "epoch": 3.4346142389037833, "grad_norm": 0.2182805985212326, "learning_rate": 4.1503546937165e-05, "loss": 0.7968, "num_input_tokens_seen": 13296592, "step": 23060 }, { "epoch": 3.4353589514447425, "grad_norm": 0.20027242600917816, "learning_rate": 4.1498664832670045e-05, "loss": 0.7816, "num_input_tokens_seen": 13299216, "step": 23065 }, { "epoch": 3.4361036639857017, "grad_norm": 0.24858388304710388, "learning_rate": 4.149378161326239e-05, "loss": 0.8234, "num_input_tokens_seen": 13302096, "step": 23070 }, { "epoch": 3.436848376526661, "grad_norm": 0.1597658395767212, "learning_rate": 4.148889727927204e-05, "loss": 0.7971, "num_input_tokens_seen": 13304880, "step": 23075 }, { "epoch": 3.43759308906762, "grad_norm": 0.2603559195995331, "learning_rate": 4.1484011831029054e-05, "loss": 0.8303, "num_input_tokens_seen": 13307696, "step": 23080 }, { "epoch": 3.4383378016085793, "grad_norm": 0.2250351756811142, "learning_rate": 4.147912526886356e-05, "loss": 0.8276, "num_input_tokens_seen": 13310704, "step": 23085 }, { "epoch": 3.4390825141495385, "grad_norm": 0.2780987322330475, "learning_rate": 4.147423759310579e-05, "loss": 0.7818, "num_input_tokens_seen": 13313808, "step": 23090 }, { "epoch": 3.4398272266904977, "grad_norm": 0.28868329524993896, "learning_rate": 4.1469348804086016e-05, "loss": 0.8351, "num_input_tokens_seen": 13316592, "step": 23095 }, { "epoch": 3.4405719392314564, "grad_norm": 0.31823405623435974, "learning_rate": 4.14644589021346e-05, "loss": 0.833, "num_input_tokens_seen": 13319600, "step": 23100 }, { "epoch": 3.441316651772416, "grad_norm": 0.19420595467090607, "learning_rate": 4.1459567887582015e-05, "loss": 0.7996, "num_input_tokens_seen": 13322768, "step": 23105 }, { "epoch": 3.442061364313375, "grad_norm": 0.31200897693634033, "learning_rate": 4.145467576075874e-05, "loss": 0.809, "num_input_tokens_seen": 13325872, "step": 23110 }, { "epoch": 3.442806076854334, "grad_norm": 0.3453998565673828, "learning_rate": 4.144978252199537e-05, "loss": 0.796, "num_input_tokens_seen": 13328976, "step": 23115 }, { "epoch": 3.4435507893952932, "grad_norm": 0.27296561002731323, "learning_rate": 4.1444888171622584e-05, "loss": 0.8373, "num_input_tokens_seen": 13331568, "step": 23120 }, { "epoch": 3.4442955019362524, "grad_norm": 0.3096604347229004, "learning_rate": 4.143999270997111e-05, "loss": 0.8006, "num_input_tokens_seen": 13334288, "step": 23125 }, { "epoch": 3.4450402144772116, "grad_norm": 0.22483639419078827, "learning_rate": 4.143509613737178e-05, "loss": 0.8073, "num_input_tokens_seen": 13336816, "step": 23130 }, { "epoch": 3.445784927018171, "grad_norm": 0.28260865807533264, "learning_rate": 4.143019845415546e-05, "loss": 0.8257, "num_input_tokens_seen": 13339760, "step": 23135 }, { "epoch": 3.44652963955913, "grad_norm": 0.242373526096344, "learning_rate": 4.142529966065314e-05, "loss": 0.8406, "num_input_tokens_seen": 13342736, "step": 23140 }, { "epoch": 3.4472743521000893, "grad_norm": 0.23047949373722076, "learning_rate": 4.1420399757195845e-05, "loss": 0.8157, "num_input_tokens_seen": 13345776, "step": 23145 }, { "epoch": 3.4480190646410485, "grad_norm": 0.2509438991546631, "learning_rate": 4.141549874411469e-05, "loss": 0.7843, "num_input_tokens_seen": 13348592, "step": 23150 }, { "epoch": 3.4487637771820077, "grad_norm": 0.2625637948513031, "learning_rate": 4.1410596621740874e-05, "loss": 0.7886, "num_input_tokens_seen": 13351504, "step": 23155 }, { "epoch": 3.449508489722967, "grad_norm": 0.2816135585308075, "learning_rate": 4.140569339040566e-05, "loss": 0.8171, "num_input_tokens_seen": 13354256, "step": 23160 }, { "epoch": 3.450253202263926, "grad_norm": 0.27610233426094055, "learning_rate": 4.140078905044039e-05, "loss": 0.808, "num_input_tokens_seen": 13357040, "step": 23165 }, { "epoch": 3.4509979148048853, "grad_norm": 0.331493079662323, "learning_rate": 4.1395883602176466e-05, "loss": 0.8276, "num_input_tokens_seen": 13360176, "step": 23170 }, { "epoch": 3.4517426273458445, "grad_norm": 0.313522070646286, "learning_rate": 4.13909770459454e-05, "loss": 0.811, "num_input_tokens_seen": 13363152, "step": 23175 }, { "epoch": 3.4524873398868037, "grad_norm": 0.19931088387966156, "learning_rate": 4.138606938207874e-05, "loss": 0.7947, "num_input_tokens_seen": 13366064, "step": 23180 }, { "epoch": 3.453232052427763, "grad_norm": 0.26516249775886536, "learning_rate": 4.1381160610908134e-05, "loss": 0.8002, "num_input_tokens_seen": 13368784, "step": 23185 }, { "epoch": 3.453976764968722, "grad_norm": 0.36498314142227173, "learning_rate": 4.13762507327653e-05, "loss": 0.8208, "num_input_tokens_seen": 13371792, "step": 23190 }, { "epoch": 3.4547214775096813, "grad_norm": 0.2739724814891815, "learning_rate": 4.137133974798202e-05, "loss": 0.8401, "num_input_tokens_seen": 13374864, "step": 23195 }, { "epoch": 3.4554661900506405, "grad_norm": 0.26378288865089417, "learning_rate": 4.1366427656890156e-05, "loss": 0.8143, "num_input_tokens_seen": 13377424, "step": 23200 }, { "epoch": 3.4562109025915997, "grad_norm": 0.2561686933040619, "learning_rate": 4.136151445982165e-05, "loss": 0.8166, "num_input_tokens_seen": 13380080, "step": 23205 }, { "epoch": 3.456955615132559, "grad_norm": 0.191733717918396, "learning_rate": 4.135660015710853e-05, "loss": 0.793, "num_input_tokens_seen": 13382928, "step": 23210 }, { "epoch": 3.457700327673518, "grad_norm": 0.31906580924987793, "learning_rate": 4.1351684749082866e-05, "loss": 0.8069, "num_input_tokens_seen": 13385872, "step": 23215 }, { "epoch": 3.4584450402144773, "grad_norm": 0.17914454638957977, "learning_rate": 4.1346768236076825e-05, "loss": 0.7863, "num_input_tokens_seen": 13388880, "step": 23220 }, { "epoch": 3.4591897527554365, "grad_norm": 0.22633253037929535, "learning_rate": 4.134185061842265e-05, "loss": 0.8216, "num_input_tokens_seen": 13391792, "step": 23225 }, { "epoch": 3.4599344652963957, "grad_norm": 0.24927671253681183, "learning_rate": 4.133693189645265e-05, "loss": 0.8033, "num_input_tokens_seen": 13394672, "step": 23230 }, { "epoch": 3.460679177837355, "grad_norm": 0.31446823477745056, "learning_rate": 4.133201207049921e-05, "loss": 0.8047, "num_input_tokens_seen": 13397584, "step": 23235 }, { "epoch": 3.461423890378314, "grad_norm": 0.25214168429374695, "learning_rate": 4.1327091140894805e-05, "loss": 0.8232, "num_input_tokens_seen": 13400400, "step": 23240 }, { "epoch": 3.4621686029192733, "grad_norm": 0.26673391461372375, "learning_rate": 4.132216910797195e-05, "loss": 0.8436, "num_input_tokens_seen": 13403152, "step": 23245 }, { "epoch": 3.4629133154602325, "grad_norm": 0.27882274985313416, "learning_rate": 4.131724597206328e-05, "loss": 0.7845, "num_input_tokens_seen": 13405776, "step": 23250 }, { "epoch": 3.4636580280011917, "grad_norm": 0.24068215489387512, "learning_rate": 4.131232173350146e-05, "loss": 0.8117, "num_input_tokens_seen": 13408592, "step": 23255 }, { "epoch": 3.464402740542151, "grad_norm": 0.3526289165019989, "learning_rate": 4.130739639261926e-05, "loss": 0.7836, "num_input_tokens_seen": 13411376, "step": 23260 }, { "epoch": 3.4651474530831097, "grad_norm": 0.2690260708332062, "learning_rate": 4.130246994974952e-05, "loss": 0.7813, "num_input_tokens_seen": 13414160, "step": 23265 }, { "epoch": 3.4658921656240693, "grad_norm": 0.2911822199821472, "learning_rate": 4.129754240522513e-05, "loss": 0.8114, "num_input_tokens_seen": 13417104, "step": 23270 }, { "epoch": 3.466636878165028, "grad_norm": 0.3134201467037201, "learning_rate": 4.12926137593791e-05, "loss": 0.8272, "num_input_tokens_seen": 13420144, "step": 23275 }, { "epoch": 3.4673815907059877, "grad_norm": 0.22892902791500092, "learning_rate": 4.128768401254446e-05, "loss": 0.7822, "num_input_tokens_seen": 13423216, "step": 23280 }, { "epoch": 3.4681263032469465, "grad_norm": 0.23900924623012543, "learning_rate": 4.128275316505435e-05, "loss": 0.8228, "num_input_tokens_seen": 13425904, "step": 23285 }, { "epoch": 3.4688710157879057, "grad_norm": 0.2693867087364197, "learning_rate": 4.1277821217242e-05, "loss": 0.8243, "num_input_tokens_seen": 13428912, "step": 23290 }, { "epoch": 3.469615728328865, "grad_norm": 0.29238370060920715, "learning_rate": 4.127288816944066e-05, "loss": 0.801, "num_input_tokens_seen": 13431920, "step": 23295 }, { "epoch": 3.470360440869824, "grad_norm": 0.2314627319574356, "learning_rate": 4.12679540219837e-05, "loss": 0.8034, "num_input_tokens_seen": 13434384, "step": 23300 }, { "epoch": 3.4711051534107833, "grad_norm": 0.24502909183502197, "learning_rate": 4.126301877520456e-05, "loss": 0.81, "num_input_tokens_seen": 13437168, "step": 23305 }, { "epoch": 3.4718498659517425, "grad_norm": 0.2301337718963623, "learning_rate": 4.125808242943672e-05, "loss": 0.7979, "num_input_tokens_seen": 13439952, "step": 23310 }, { "epoch": 3.4725945784927017, "grad_norm": 0.22790290415287018, "learning_rate": 4.125314498501377e-05, "loss": 0.7942, "num_input_tokens_seen": 13442544, "step": 23315 }, { "epoch": 3.473339291033661, "grad_norm": 0.2125188559293747, "learning_rate": 4.124820644226936e-05, "loss": 0.7921, "num_input_tokens_seen": 13445296, "step": 23320 }, { "epoch": 3.47408400357462, "grad_norm": 0.22407065331935883, "learning_rate": 4.124326680153723e-05, "loss": 0.8302, "num_input_tokens_seen": 13448112, "step": 23325 }, { "epoch": 3.4748287161155793, "grad_norm": 0.20600895583629608, "learning_rate": 4.1238326063151164e-05, "loss": 0.8135, "num_input_tokens_seen": 13450896, "step": 23330 }, { "epoch": 3.4755734286565385, "grad_norm": 0.31404003500938416, "learning_rate": 4.1233384227445036e-05, "loss": 0.81, "num_input_tokens_seen": 13453968, "step": 23335 }, { "epoch": 3.4763181411974977, "grad_norm": 0.22032007575035095, "learning_rate": 4.122844129475281e-05, "loss": 0.7994, "num_input_tokens_seen": 13457136, "step": 23340 }, { "epoch": 3.477062853738457, "grad_norm": 0.2704406678676605, "learning_rate": 4.1223497265408505e-05, "loss": 0.8105, "num_input_tokens_seen": 13459984, "step": 23345 }, { "epoch": 3.477807566279416, "grad_norm": 0.2637990415096283, "learning_rate": 4.12185521397462e-05, "loss": 0.7819, "num_input_tokens_seen": 13462960, "step": 23350 }, { "epoch": 3.4785522788203753, "grad_norm": 0.2879294455051422, "learning_rate": 4.12136059181001e-05, "loss": 0.8286, "num_input_tokens_seen": 13465584, "step": 23355 }, { "epoch": 3.4792969913613345, "grad_norm": 0.16461509466171265, "learning_rate": 4.1208658600804416e-05, "loss": 0.8188, "num_input_tokens_seen": 13468336, "step": 23360 }, { "epoch": 3.4800417039022937, "grad_norm": 0.2130557894706726, "learning_rate": 4.120371018819349e-05, "loss": 0.8082, "num_input_tokens_seen": 13471120, "step": 23365 }, { "epoch": 3.480786416443253, "grad_norm": 0.2531171441078186, "learning_rate": 4.1198760680601713e-05, "loss": 0.8089, "num_input_tokens_seen": 13474064, "step": 23370 }, { "epoch": 3.481531128984212, "grad_norm": 0.27293363213539124, "learning_rate": 4.1193810078363544e-05, "loss": 0.786, "num_input_tokens_seen": 13477168, "step": 23375 }, { "epoch": 3.4822758415251713, "grad_norm": 0.2957358956336975, "learning_rate": 4.1188858381813524e-05, "loss": 0.8178, "num_input_tokens_seen": 13480080, "step": 23380 }, { "epoch": 3.4830205540661305, "grad_norm": 0.29879599809646606, "learning_rate": 4.118390559128629e-05, "loss": 0.8201, "num_input_tokens_seen": 13483120, "step": 23385 }, { "epoch": 3.4837652666070897, "grad_norm": 0.2590026557445526, "learning_rate": 4.11789517071165e-05, "loss": 0.7964, "num_input_tokens_seen": 13486064, "step": 23390 }, { "epoch": 3.484509979148049, "grad_norm": 0.30233249068260193, "learning_rate": 4.117399672963893e-05, "loss": 0.8081, "num_input_tokens_seen": 13489136, "step": 23395 }, { "epoch": 3.485254691689008, "grad_norm": 0.23154863715171814, "learning_rate": 4.116904065918843e-05, "loss": 0.793, "num_input_tokens_seen": 13492144, "step": 23400 }, { "epoch": 3.4859994042299673, "grad_norm": 0.2635228633880615, "learning_rate": 4.11640834960999e-05, "loss": 0.8219, "num_input_tokens_seen": 13495120, "step": 23405 }, { "epoch": 3.4867441167709265, "grad_norm": 0.24695424735546112, "learning_rate": 4.115912524070832e-05, "loss": 0.8166, "num_input_tokens_seen": 13498192, "step": 23410 }, { "epoch": 3.4874888293118858, "grad_norm": 0.2438945323228836, "learning_rate": 4.1154165893348754e-05, "loss": 0.7895, "num_input_tokens_seen": 13500784, "step": 23415 }, { "epoch": 3.488233541852845, "grad_norm": 0.2457999289035797, "learning_rate": 4.114920545435634e-05, "loss": 0.8133, "num_input_tokens_seen": 13503536, "step": 23420 }, { "epoch": 3.488978254393804, "grad_norm": 0.23672892153263092, "learning_rate": 4.114424392406628e-05, "loss": 0.8077, "num_input_tokens_seen": 13506384, "step": 23425 }, { "epoch": 3.4897229669347634, "grad_norm": 0.22590743005275726, "learning_rate": 4.113928130281385e-05, "loss": 0.7843, "num_input_tokens_seen": 13509168, "step": 23430 }, { "epoch": 3.4904676794757226, "grad_norm": 0.29215332865715027, "learning_rate": 4.113431759093441e-05, "loss": 0.8196, "num_input_tokens_seen": 13512080, "step": 23435 }, { "epoch": 3.4912123920166813, "grad_norm": 0.27125197649002075, "learning_rate": 4.112935278876338e-05, "loss": 0.8308, "num_input_tokens_seen": 13515152, "step": 23440 }, { "epoch": 3.491957104557641, "grad_norm": 0.2771326005458832, "learning_rate": 4.112438689663627e-05, "loss": 0.8098, "num_input_tokens_seen": 13518192, "step": 23445 }, { "epoch": 3.4927018170985997, "grad_norm": 0.22104839980602264, "learning_rate": 4.1119419914888645e-05, "loss": 0.8245, "num_input_tokens_seen": 13520880, "step": 23450 }, { "epoch": 3.4934465296395594, "grad_norm": 0.1853637844324112, "learning_rate": 4.111445184385616e-05, "loss": 0.796, "num_input_tokens_seen": 13523728, "step": 23455 }, { "epoch": 3.494191242180518, "grad_norm": 0.17548029124736786, "learning_rate": 4.110948268387455e-05, "loss": 0.8076, "num_input_tokens_seen": 13526736, "step": 23460 }, { "epoch": 3.4949359547214773, "grad_norm": 0.26516446471214294, "learning_rate": 4.110451243527957e-05, "loss": 0.7726, "num_input_tokens_seen": 13529872, "step": 23465 }, { "epoch": 3.4956806672624365, "grad_norm": 0.2403828501701355, "learning_rate": 4.109954109840714e-05, "loss": 0.7725, "num_input_tokens_seen": 13532752, "step": 23470 }, { "epoch": 3.4964253798033957, "grad_norm": 0.229392409324646, "learning_rate": 4.109456867359317e-05, "loss": 0.8121, "num_input_tokens_seen": 13536016, "step": 23475 }, { "epoch": 3.497170092344355, "grad_norm": 0.2918967008590698, "learning_rate": 4.108959516117368e-05, "loss": 0.8154, "num_input_tokens_seen": 13538896, "step": 23480 }, { "epoch": 3.497914804885314, "grad_norm": 0.30166369676589966, "learning_rate": 4.108462056148477e-05, "loss": 0.8017, "num_input_tokens_seen": 13541840, "step": 23485 }, { "epoch": 3.4986595174262733, "grad_norm": 0.24853834509849548, "learning_rate": 4.10796448748626e-05, "loss": 0.8049, "num_input_tokens_seen": 13544784, "step": 23490 }, { "epoch": 3.4994042299672325, "grad_norm": 0.25792577862739563, "learning_rate": 4.10746681016434e-05, "loss": 0.8214, "num_input_tokens_seen": 13547760, "step": 23495 }, { "epoch": 3.5, "eval_loss": 0.8087077736854553, "eval_runtime": 45.4148, "eval_samples_per_second": 65.705, "eval_steps_per_second": 16.426, "num_input_tokens_seen": 13550032, "step": 23499 }, { "epoch": 3.5001489425081918, "grad_norm": 0.20592100918293, "learning_rate": 4.1069690242163484e-05, "loss": 0.805, "num_input_tokens_seen": 13550544, "step": 23500 }, { "epoch": 3.500893655049151, "grad_norm": 0.19253993034362793, "learning_rate": 4.106471129675924e-05, "loss": 0.7889, "num_input_tokens_seen": 13553264, "step": 23505 }, { "epoch": 3.50163836759011, "grad_norm": 0.2737029492855072, "learning_rate": 4.105973126576712e-05, "loss": 0.8122, "num_input_tokens_seen": 13555984, "step": 23510 }, { "epoch": 3.5023830801310694, "grad_norm": 0.26341113448143005, "learning_rate": 4.105475014952365e-05, "loss": 0.7643, "num_input_tokens_seen": 13559024, "step": 23515 }, { "epoch": 3.5031277926720286, "grad_norm": 0.26306548714637756, "learning_rate": 4.104976794836545e-05, "loss": 0.8082, "num_input_tokens_seen": 13561968, "step": 23520 }, { "epoch": 3.5038725052129878, "grad_norm": 0.21840399503707886, "learning_rate": 4.104478466262917e-05, "loss": 0.7953, "num_input_tokens_seen": 13564944, "step": 23525 }, { "epoch": 3.504617217753947, "grad_norm": 0.21285803616046906, "learning_rate": 4.1039800292651584e-05, "loss": 0.8104, "num_input_tokens_seen": 13567632, "step": 23530 }, { "epoch": 3.505361930294906, "grad_norm": 0.3098803758621216, "learning_rate": 4.103481483876951e-05, "loss": 0.7765, "num_input_tokens_seen": 13571088, "step": 23535 }, { "epoch": 3.5061066428358654, "grad_norm": 0.21559129655361176, "learning_rate": 4.1029828301319836e-05, "loss": 0.7949, "num_input_tokens_seen": 13573840, "step": 23540 }, { "epoch": 3.5068513553768246, "grad_norm": 0.24457983672618866, "learning_rate": 4.102484068063954e-05, "loss": 0.7893, "num_input_tokens_seen": 13576624, "step": 23545 }, { "epoch": 3.5075960679177838, "grad_norm": 0.24788793921470642, "learning_rate": 4.1019851977065674e-05, "loss": 0.8159, "num_input_tokens_seen": 13579472, "step": 23550 }, { "epoch": 3.508340780458743, "grad_norm": 0.22092057764530182, "learning_rate": 4.101486219093533e-05, "loss": 0.8314, "num_input_tokens_seen": 13582288, "step": 23555 }, { "epoch": 3.509085492999702, "grad_norm": 0.2845991551876068, "learning_rate": 4.100987132258571e-05, "loss": 0.7945, "num_input_tokens_seen": 13585200, "step": 23560 }, { "epoch": 3.5098302055406614, "grad_norm": 0.3374817669391632, "learning_rate": 4.1004879372354085e-05, "loss": 0.7756, "num_input_tokens_seen": 13587984, "step": 23565 }, { "epoch": 3.5105749180816206, "grad_norm": 0.17651398479938507, "learning_rate": 4.099988634057778e-05, "loss": 0.7904, "num_input_tokens_seen": 13591024, "step": 23570 }, { "epoch": 3.51131963062258, "grad_norm": 0.22223930060863495, "learning_rate": 4.09948922275942e-05, "loss": 0.797, "num_input_tokens_seen": 13593904, "step": 23575 }, { "epoch": 3.512064343163539, "grad_norm": 0.248775914311409, "learning_rate": 4.098989703374084e-05, "loss": 0.8132, "num_input_tokens_seen": 13597072, "step": 23580 }, { "epoch": 3.512809055704498, "grad_norm": 0.23003238439559937, "learning_rate": 4.0984900759355254e-05, "loss": 0.7871, "num_input_tokens_seen": 13600176, "step": 23585 }, { "epoch": 3.5135537682454574, "grad_norm": 0.30061450600624084, "learning_rate": 4.097990340477507e-05, "loss": 0.8026, "num_input_tokens_seen": 13603088, "step": 23590 }, { "epoch": 3.5142984807864166, "grad_norm": 0.31452929973602295, "learning_rate": 4.097490497033797e-05, "loss": 0.7976, "num_input_tokens_seen": 13605968, "step": 23595 }, { "epoch": 3.515043193327376, "grad_norm": 0.30659717321395874, "learning_rate": 4.096990545638174e-05, "loss": 0.8067, "num_input_tokens_seen": 13608816, "step": 23600 }, { "epoch": 3.5157879058683346, "grad_norm": 0.2943829894065857, "learning_rate": 4.096490486324424e-05, "loss": 0.796, "num_input_tokens_seen": 13611856, "step": 23605 }, { "epoch": 3.516532618409294, "grad_norm": 0.2914053499698639, "learning_rate": 4.095990319126337e-05, "loss": 0.7904, "num_input_tokens_seen": 13614832, "step": 23610 }, { "epoch": 3.517277330950253, "grad_norm": 0.30772584676742554, "learning_rate": 4.0954900440777125e-05, "loss": 0.8129, "num_input_tokens_seen": 13617584, "step": 23615 }, { "epoch": 3.5180220434912126, "grad_norm": 0.3398233950138092, "learning_rate": 4.094989661212359e-05, "loss": 0.8007, "num_input_tokens_seen": 13620560, "step": 23620 }, { "epoch": 3.5187667560321714, "grad_norm": 0.2703867554664612, "learning_rate": 4.094489170564088e-05, "loss": 0.7523, "num_input_tokens_seen": 13623248, "step": 23625 }, { "epoch": 3.519511468573131, "grad_norm": 0.2719556987285614, "learning_rate": 4.0939885721667216e-05, "loss": 0.7686, "num_input_tokens_seen": 13626320, "step": 23630 }, { "epoch": 3.5202561811140898, "grad_norm": 0.2146788388490677, "learning_rate": 4.093487866054088e-05, "loss": 0.7983, "num_input_tokens_seen": 13629296, "step": 23635 }, { "epoch": 3.5210008936550494, "grad_norm": 0.16761259734630585, "learning_rate": 4.0929870522600233e-05, "loss": 0.8514, "num_input_tokens_seen": 13632112, "step": 23640 }, { "epoch": 3.521745606196008, "grad_norm": 0.30650725960731506, "learning_rate": 4.092486130818371e-05, "loss": 0.8455, "num_input_tokens_seen": 13635184, "step": 23645 }, { "epoch": 3.5224903187369674, "grad_norm": 0.2484157234430313, "learning_rate": 4.09198510176298e-05, "loss": 0.8387, "num_input_tokens_seen": 13638256, "step": 23650 }, { "epoch": 3.5232350312779266, "grad_norm": 0.34236833453178406, "learning_rate": 4.091483965127708e-05, "loss": 0.8145, "num_input_tokens_seen": 13641040, "step": 23655 }, { "epoch": 3.523979743818886, "grad_norm": 0.3889801800251007, "learning_rate": 4.09098272094642e-05, "loss": 0.8432, "num_input_tokens_seen": 13643664, "step": 23660 }, { "epoch": 3.524724456359845, "grad_norm": 0.20715108513832092, "learning_rate": 4.0904813692529886e-05, "loss": 0.7915, "num_input_tokens_seen": 13646544, "step": 23665 }, { "epoch": 3.525469168900804, "grad_norm": 0.22052401304244995, "learning_rate": 4.089979910081293e-05, "loss": 0.7983, "num_input_tokens_seen": 13649616, "step": 23670 }, { "epoch": 3.5262138814417634, "grad_norm": 0.21448928117752075, "learning_rate": 4.089478343465219e-05, "loss": 0.7927, "num_input_tokens_seen": 13652432, "step": 23675 }, { "epoch": 3.5269585939827226, "grad_norm": 0.2819749712944031, "learning_rate": 4.088976669438661e-05, "loss": 0.7993, "num_input_tokens_seen": 13654928, "step": 23680 }, { "epoch": 3.527703306523682, "grad_norm": 0.41068458557128906, "learning_rate": 4.088474888035519e-05, "loss": 0.8173, "num_input_tokens_seen": 13658096, "step": 23685 }, { "epoch": 3.528448019064641, "grad_norm": 0.254107266664505, "learning_rate": 4.087972999289704e-05, "loss": 0.8255, "num_input_tokens_seen": 13661008, "step": 23690 }, { "epoch": 3.5291927316056, "grad_norm": 0.2432793229818344, "learning_rate": 4.0874710032351296e-05, "loss": 0.7956, "num_input_tokens_seen": 13663792, "step": 23695 }, { "epoch": 3.5299374441465594, "grad_norm": 0.24307125806808472, "learning_rate": 4.086968899905719e-05, "loss": 0.7905, "num_input_tokens_seen": 13666576, "step": 23700 }, { "epoch": 3.5306821566875186, "grad_norm": 0.24511775374412537, "learning_rate": 4.086466689335402e-05, "loss": 0.8239, "num_input_tokens_seen": 13669360, "step": 23705 }, { "epoch": 3.531426869228478, "grad_norm": 0.2183266431093216, "learning_rate": 4.085964371558116e-05, "loss": 0.8207, "num_input_tokens_seen": 13672240, "step": 23710 }, { "epoch": 3.532171581769437, "grad_norm": 0.211689293384552, "learning_rate": 4.085461946607806e-05, "loss": 0.7834, "num_input_tokens_seen": 13675472, "step": 23715 }, { "epoch": 3.532916294310396, "grad_norm": 0.27049270272254944, "learning_rate": 4.084959414518423e-05, "loss": 0.7907, "num_input_tokens_seen": 13678384, "step": 23720 }, { "epoch": 3.5336610068513554, "grad_norm": 0.2711082398891449, "learning_rate": 4.0844567753239276e-05, "loss": 0.8117, "num_input_tokens_seen": 13680848, "step": 23725 }, { "epoch": 3.5344057193923146, "grad_norm": 0.33340054750442505, "learning_rate": 4.0839540290582856e-05, "loss": 0.8008, "num_input_tokens_seen": 13684208, "step": 23730 }, { "epoch": 3.535150431933274, "grad_norm": 0.22151383757591248, "learning_rate": 4.08345117575547e-05, "loss": 0.8395, "num_input_tokens_seen": 13687120, "step": 23735 }, { "epoch": 3.535895144474233, "grad_norm": 0.3067363202571869, "learning_rate": 4.082948215449461e-05, "loss": 0.7692, "num_input_tokens_seen": 13690096, "step": 23740 }, { "epoch": 3.5366398570151922, "grad_norm": 0.28958824276924133, "learning_rate": 4.0824451481742475e-05, "loss": 0.7965, "num_input_tokens_seen": 13693360, "step": 23745 }, { "epoch": 3.5373845695561514, "grad_norm": 0.3172234296798706, "learning_rate": 4.081941973963825e-05, "loss": 0.8255, "num_input_tokens_seen": 13696464, "step": 23750 }, { "epoch": 3.5381292820971106, "grad_norm": 0.2869621813297272, "learning_rate": 4.0814386928521964e-05, "loss": 0.8172, "num_input_tokens_seen": 13699312, "step": 23755 }, { "epoch": 3.53887399463807, "grad_norm": 0.3114333748817444, "learning_rate": 4.0809353048733696e-05, "loss": 0.8296, "num_input_tokens_seen": 13702096, "step": 23760 }, { "epoch": 3.539618707179029, "grad_norm": 0.3185190260410309, "learning_rate": 4.0804318100613624e-05, "loss": 0.8085, "num_input_tokens_seen": 13704976, "step": 23765 }, { "epoch": 3.5403634197199882, "grad_norm": 0.2373967170715332, "learning_rate": 4.0799282084502e-05, "loss": 0.8115, "num_input_tokens_seen": 13707984, "step": 23770 }, { "epoch": 3.5411081322609474, "grad_norm": 0.25749829411506653, "learning_rate": 4.079424500073912e-05, "loss": 0.7996, "num_input_tokens_seen": 13710864, "step": 23775 }, { "epoch": 3.541852844801906, "grad_norm": 0.40616264939308167, "learning_rate": 4.078920684966538e-05, "loss": 0.8039, "num_input_tokens_seen": 13713808, "step": 23780 }, { "epoch": 3.542597557342866, "grad_norm": 0.24925103783607483, "learning_rate": 4.078416763162123e-05, "loss": 0.7704, "num_input_tokens_seen": 13716496, "step": 23785 }, { "epoch": 3.5433422698838246, "grad_norm": 0.2681008279323578, "learning_rate": 4.0779127346947214e-05, "loss": 0.7686, "num_input_tokens_seen": 13719088, "step": 23790 }, { "epoch": 3.5440869824247843, "grad_norm": 0.2529085576534271, "learning_rate": 4.077408599598392e-05, "loss": 0.7678, "num_input_tokens_seen": 13722128, "step": 23795 }, { "epoch": 3.544831694965743, "grad_norm": 0.24182207882404327, "learning_rate": 4.076904357907203e-05, "loss": 0.7884, "num_input_tokens_seen": 13725008, "step": 23800 }, { "epoch": 3.5455764075067027, "grad_norm": 0.2692389190196991, "learning_rate": 4.076400009655228e-05, "loss": 0.7965, "num_input_tokens_seen": 13727856, "step": 23805 }, { "epoch": 3.5463211200476614, "grad_norm": 0.24233178794384003, "learning_rate": 4.0758955548765505e-05, "loss": 0.769, "num_input_tokens_seen": 13730736, "step": 23810 }, { "epoch": 3.5470658325886206, "grad_norm": 0.26278048753738403, "learning_rate": 4.075390993605258e-05, "loss": 0.8152, "num_input_tokens_seen": 13733520, "step": 23815 }, { "epoch": 3.54781054512958, "grad_norm": 0.22529292106628418, "learning_rate": 4.074886325875447e-05, "loss": 0.8306, "num_input_tokens_seen": 13736624, "step": 23820 }, { "epoch": 3.548555257670539, "grad_norm": 0.19953016936779022, "learning_rate": 4.074381551721221e-05, "loss": 0.7628, "num_input_tokens_seen": 13739440, "step": 23825 }, { "epoch": 3.5492999702114982, "grad_norm": 0.2827679216861725, "learning_rate": 4.073876671176692e-05, "loss": 0.7553, "num_input_tokens_seen": 13742352, "step": 23830 }, { "epoch": 3.5500446827524574, "grad_norm": 0.2680150866508484, "learning_rate": 4.073371684275976e-05, "loss": 0.7853, "num_input_tokens_seen": 13745616, "step": 23835 }, { "epoch": 3.5507893952934166, "grad_norm": 0.2983732223510742, "learning_rate": 4.072866591053197e-05, "loss": 0.7884, "num_input_tokens_seen": 13748560, "step": 23840 }, { "epoch": 3.551534107834376, "grad_norm": 0.24997791647911072, "learning_rate": 4.0723613915424894e-05, "loss": 0.7612, "num_input_tokens_seen": 13751696, "step": 23845 }, { "epoch": 3.552278820375335, "grad_norm": 0.2875823676586151, "learning_rate": 4.071856085777993e-05, "loss": 0.814, "num_input_tokens_seen": 13754704, "step": 23850 }, { "epoch": 3.5530235329162942, "grad_norm": 0.262459397315979, "learning_rate": 4.071350673793852e-05, "loss": 0.7984, "num_input_tokens_seen": 13757520, "step": 23855 }, { "epoch": 3.5537682454572534, "grad_norm": 0.4386597275733948, "learning_rate": 4.070845155624221e-05, "loss": 0.8648, "num_input_tokens_seen": 13760528, "step": 23860 }, { "epoch": 3.5545129579982127, "grad_norm": 0.1749204695224762, "learning_rate": 4.070339531303261e-05, "loss": 0.7578, "num_input_tokens_seen": 13763248, "step": 23865 }, { "epoch": 3.555257670539172, "grad_norm": 0.22470562160015106, "learning_rate": 4.0698338008651405e-05, "loss": 0.7738, "num_input_tokens_seen": 13766000, "step": 23870 }, { "epoch": 3.556002383080131, "grad_norm": 0.26996996998786926, "learning_rate": 4.0693279643440326e-05, "loss": 0.8342, "num_input_tokens_seen": 13768944, "step": 23875 }, { "epoch": 3.5567470956210903, "grad_norm": 0.27672523260116577, "learning_rate": 4.068822021774123e-05, "loss": 0.8289, "num_input_tokens_seen": 13771824, "step": 23880 }, { "epoch": 3.5574918081620495, "grad_norm": 0.2568104863166809, "learning_rate": 4.0683159731895994e-05, "loss": 0.7661, "num_input_tokens_seen": 13775056, "step": 23885 }, { "epoch": 3.5582365207030087, "grad_norm": 0.3206459581851959, "learning_rate": 4.067809818624658e-05, "loss": 0.832, "num_input_tokens_seen": 13777968, "step": 23890 }, { "epoch": 3.558981233243968, "grad_norm": 0.17042383551597595, "learning_rate": 4.067303558113503e-05, "loss": 0.7872, "num_input_tokens_seen": 13780688, "step": 23895 }, { "epoch": 3.559725945784927, "grad_norm": 0.2196260392665863, "learning_rate": 4.066797191690347e-05, "loss": 0.7871, "num_input_tokens_seen": 13783312, "step": 23900 }, { "epoch": 3.5604706583258863, "grad_norm": 0.2353404462337494, "learning_rate": 4.066290719389406e-05, "loss": 0.7877, "num_input_tokens_seen": 13786096, "step": 23905 }, { "epoch": 3.5612153708668455, "grad_norm": 0.36182478070259094, "learning_rate": 4.065784141244907e-05, "loss": 0.8312, "num_input_tokens_seen": 13788752, "step": 23910 }, { "epoch": 3.5619600834078047, "grad_norm": 0.24078814685344696, "learning_rate": 4.065277457291081e-05, "loss": 0.7649, "num_input_tokens_seen": 13791664, "step": 23915 }, { "epoch": 3.562704795948764, "grad_norm": 0.28970882296562195, "learning_rate": 4.0647706675621685e-05, "loss": 0.828, "num_input_tokens_seen": 13794576, "step": 23920 }, { "epoch": 3.563449508489723, "grad_norm": 0.27477553486824036, "learning_rate": 4.064263772092416e-05, "loss": 0.8088, "num_input_tokens_seen": 13797360, "step": 23925 }, { "epoch": 3.5641942210306823, "grad_norm": 0.4258468747138977, "learning_rate": 4.0637567709160786e-05, "loss": 0.7816, "num_input_tokens_seen": 13799888, "step": 23930 }, { "epoch": 3.5649389335716415, "grad_norm": 0.20482639968395233, "learning_rate": 4.0632496640674156e-05, "loss": 0.7843, "num_input_tokens_seen": 13802832, "step": 23935 }, { "epoch": 3.5656836461126007, "grad_norm": 0.28310370445251465, "learning_rate": 4.0627424515806957e-05, "loss": 0.8008, "num_input_tokens_seen": 13806064, "step": 23940 }, { "epoch": 3.5664283586535594, "grad_norm": 0.26324984431266785, "learning_rate": 4.062235133490195e-05, "loss": 0.7977, "num_input_tokens_seen": 13809072, "step": 23945 }, { "epoch": 3.567173071194519, "grad_norm": 0.20994754135608673, "learning_rate": 4.061727709830196e-05, "loss": 0.8301, "num_input_tokens_seen": 13812208, "step": 23950 }, { "epoch": 3.567917783735478, "grad_norm": 0.3080637753009796, "learning_rate": 4.061220180634987e-05, "loss": 0.8172, "num_input_tokens_seen": 13815024, "step": 23955 }, { "epoch": 3.5686624962764375, "grad_norm": 0.2827250361442566, "learning_rate": 4.060712545938866e-05, "loss": 0.8229, "num_input_tokens_seen": 13817808, "step": 23960 }, { "epoch": 3.5694072088173963, "grad_norm": 0.21795743703842163, "learning_rate": 4.0602048057761365e-05, "loss": 0.7917, "num_input_tokens_seen": 13820624, "step": 23965 }, { "epoch": 3.570151921358356, "grad_norm": 0.20788735151290894, "learning_rate": 4.0596969601811095e-05, "loss": 0.7687, "num_input_tokens_seen": 13823376, "step": 23970 }, { "epoch": 3.5708966338993147, "grad_norm": 0.36457809805870056, "learning_rate": 4.059189009188104e-05, "loss": 0.814, "num_input_tokens_seen": 13826352, "step": 23975 }, { "epoch": 3.5716413464402743, "grad_norm": 0.2502167522907257, "learning_rate": 4.058680952831444e-05, "loss": 0.7895, "num_input_tokens_seen": 13829040, "step": 23980 }, { "epoch": 3.572386058981233, "grad_norm": 0.27508872747421265, "learning_rate": 4.058172791145461e-05, "loss": 0.8081, "num_input_tokens_seen": 13832336, "step": 23985 }, { "epoch": 3.5731307715221923, "grad_norm": 0.23543915152549744, "learning_rate": 4.0576645241644985e-05, "loss": 0.823, "num_input_tokens_seen": 13835024, "step": 23990 }, { "epoch": 3.5738754840631515, "grad_norm": 0.3303966522216797, "learning_rate": 4.0571561519228984e-05, "loss": 0.8019, "num_input_tokens_seen": 13837680, "step": 23995 }, { "epoch": 3.5746201966041107, "grad_norm": 0.30896130204200745, "learning_rate": 4.056647674455017e-05, "loss": 0.8013, "num_input_tokens_seen": 13840560, "step": 24000 }, { "epoch": 3.57536490914507, "grad_norm": 0.39326944947242737, "learning_rate": 4.056139091795215e-05, "loss": 0.801, "num_input_tokens_seen": 13843376, "step": 24005 }, { "epoch": 3.576109621686029, "grad_norm": 0.25884029269218445, "learning_rate": 4.05563040397786e-05, "loss": 0.8085, "num_input_tokens_seen": 13846224, "step": 24010 }, { "epoch": 3.5768543342269883, "grad_norm": 0.2909677028656006, "learning_rate": 4.055121611037326e-05, "loss": 0.8003, "num_input_tokens_seen": 13849360, "step": 24015 }, { "epoch": 3.5775990467679475, "grad_norm": 0.29567408561706543, "learning_rate": 4.054612713007997e-05, "loss": 0.7896, "num_input_tokens_seen": 13852368, "step": 24020 }, { "epoch": 3.5783437593089067, "grad_norm": 0.3189842402935028, "learning_rate": 4.054103709924262e-05, "loss": 0.8049, "num_input_tokens_seen": 13855536, "step": 24025 }, { "epoch": 3.579088471849866, "grad_norm": 0.24761934578418732, "learning_rate": 4.0535946018205156e-05, "loss": 0.815, "num_input_tokens_seen": 13858288, "step": 24030 }, { "epoch": 3.579833184390825, "grad_norm": 0.3127354383468628, "learning_rate": 4.0530853887311634e-05, "loss": 0.7829, "num_input_tokens_seen": 13860880, "step": 24035 }, { "epoch": 3.5805778969317843, "grad_norm": 0.29244813323020935, "learning_rate": 4.052576070690615e-05, "loss": 0.7904, "num_input_tokens_seen": 13863792, "step": 24040 }, { "epoch": 3.5813226094727435, "grad_norm": 0.22728818655014038, "learning_rate": 4.052066647733287e-05, "loss": 0.801, "num_input_tokens_seen": 13866704, "step": 24045 }, { "epoch": 3.5820673220137027, "grad_norm": 0.28830432891845703, "learning_rate": 4.051557119893606e-05, "loss": 0.8025, "num_input_tokens_seen": 13869616, "step": 24050 }, { "epoch": 3.582812034554662, "grad_norm": 0.2869107127189636, "learning_rate": 4.051047487206003e-05, "loss": 0.8166, "num_input_tokens_seen": 13872496, "step": 24055 }, { "epoch": 3.583556747095621, "grad_norm": 0.31290188431739807, "learning_rate": 4.050537749704917e-05, "loss": 0.8295, "num_input_tokens_seen": 13875376, "step": 24060 }, { "epoch": 3.5843014596365803, "grad_norm": 0.3162446618080139, "learning_rate": 4.050027907424794e-05, "loss": 0.7835, "num_input_tokens_seen": 13878160, "step": 24065 }, { "epoch": 3.5850461721775395, "grad_norm": 0.25126388669013977, "learning_rate": 4.049517960400086e-05, "loss": 0.7943, "num_input_tokens_seen": 13880912, "step": 24070 }, { "epoch": 3.5857908847184987, "grad_norm": 0.372679740190506, "learning_rate": 4.049007908665255e-05, "loss": 0.8352, "num_input_tokens_seen": 13883856, "step": 24075 }, { "epoch": 3.586535597259458, "grad_norm": 0.3287745416164398, "learning_rate": 4.0484977522547676e-05, "loss": 0.8128, "num_input_tokens_seen": 13886768, "step": 24080 }, { "epoch": 3.587280309800417, "grad_norm": 0.19010846316814423, "learning_rate": 4.047987491203097e-05, "loss": 0.7702, "num_input_tokens_seen": 13889488, "step": 24085 }, { "epoch": 3.5880250223413763, "grad_norm": 0.24950246512889862, "learning_rate": 4.0474771255447256e-05, "loss": 0.789, "num_input_tokens_seen": 13892336, "step": 24090 }, { "epoch": 3.5887697348823355, "grad_norm": 0.2154237926006317, "learning_rate": 4.046966655314142e-05, "loss": 0.8152, "num_input_tokens_seen": 13895152, "step": 24095 }, { "epoch": 3.5895144474232947, "grad_norm": 0.22965523600578308, "learning_rate": 4.0464560805458405e-05, "loss": 0.824, "num_input_tokens_seen": 13898000, "step": 24100 }, { "epoch": 3.590259159964254, "grad_norm": 0.1947767734527588, "learning_rate": 4.045945401274326e-05, "loss": 0.7975, "num_input_tokens_seen": 13900880, "step": 24105 }, { "epoch": 3.591003872505213, "grad_norm": 0.4108758866786957, "learning_rate": 4.0454346175341054e-05, "loss": 0.8427, "num_input_tokens_seen": 13903952, "step": 24110 }, { "epoch": 3.5917485850461723, "grad_norm": 0.42124706506729126, "learning_rate": 4.0449237293596975e-05, "loss": 0.7968, "num_input_tokens_seen": 13906768, "step": 24115 }, { "epoch": 3.592493297587131, "grad_norm": 0.2973896861076355, "learning_rate": 4.0444127367856246e-05, "loss": 0.8176, "num_input_tokens_seen": 13909520, "step": 24120 }, { "epoch": 3.5932380101280907, "grad_norm": 0.2927597165107727, "learning_rate": 4.043901639846418e-05, "loss": 0.7542, "num_input_tokens_seen": 13912464, "step": 24125 }, { "epoch": 3.5939827226690495, "grad_norm": 0.28795337677001953, "learning_rate": 4.043390438576616e-05, "loss": 0.7894, "num_input_tokens_seen": 13915536, "step": 24130 }, { "epoch": 3.594727435210009, "grad_norm": 0.3634902536869049, "learning_rate": 4.042879133010763e-05, "loss": 0.7999, "num_input_tokens_seen": 13918384, "step": 24135 }, { "epoch": 3.595472147750968, "grad_norm": 0.23885424435138702, "learning_rate": 4.042367723183411e-05, "loss": 0.7631, "num_input_tokens_seen": 13921648, "step": 24140 }, { "epoch": 3.5962168602919276, "grad_norm": 0.24670487642288208, "learning_rate": 4.041856209129119e-05, "loss": 0.7632, "num_input_tokens_seen": 13924176, "step": 24145 }, { "epoch": 3.5969615728328863, "grad_norm": 0.28153765201568604, "learning_rate": 4.0413445908824534e-05, "loss": 0.8181, "num_input_tokens_seen": 13926928, "step": 24150 }, { "epoch": 3.597706285373846, "grad_norm": 0.24195417761802673, "learning_rate": 4.040832868477987e-05, "loss": 0.8148, "num_input_tokens_seen": 13929616, "step": 24155 }, { "epoch": 3.5984509979148047, "grad_norm": 0.20478898286819458, "learning_rate": 4.040321041950299e-05, "loss": 0.8366, "num_input_tokens_seen": 13932368, "step": 24160 }, { "epoch": 3.599195710455764, "grad_norm": 0.2432299107313156, "learning_rate": 4.039809111333979e-05, "loss": 0.8115, "num_input_tokens_seen": 13935120, "step": 24165 }, { "epoch": 3.599940422996723, "grad_norm": 0.24565504491329193, "learning_rate": 4.039297076663619e-05, "loss": 0.7328, "num_input_tokens_seen": 13937904, "step": 24170 }, { "epoch": 3.6006851355376823, "grad_norm": 0.22379310429096222, "learning_rate": 4.03878493797382e-05, "loss": 0.8373, "num_input_tokens_seen": 13940688, "step": 24175 }, { "epoch": 3.6014298480786415, "grad_norm": 0.2982877194881439, "learning_rate": 4.0382726952991924e-05, "loss": 0.8296, "num_input_tokens_seen": 13943504, "step": 24180 }, { "epoch": 3.6021745606196007, "grad_norm": 0.26929065585136414, "learning_rate": 4.037760348674349e-05, "loss": 0.7622, "num_input_tokens_seen": 13946352, "step": 24185 }, { "epoch": 3.60291927316056, "grad_norm": 0.22750356793403625, "learning_rate": 4.037247898133915e-05, "loss": 0.8046, "num_input_tokens_seen": 13949328, "step": 24190 }, { "epoch": 3.603663985701519, "grad_norm": 0.21649888157844543, "learning_rate": 4.036735343712516e-05, "loss": 0.7559, "num_input_tokens_seen": 13952240, "step": 24195 }, { "epoch": 3.6044086982424783, "grad_norm": 0.25017207860946655, "learning_rate": 4.036222685444792e-05, "loss": 0.8356, "num_input_tokens_seen": 13955184, "step": 24200 }, { "epoch": 3.6051534107834375, "grad_norm": 0.28168752789497375, "learning_rate": 4.035709923365384e-05, "loss": 0.7931, "num_input_tokens_seen": 13957904, "step": 24205 }, { "epoch": 3.6058981233243967, "grad_norm": 0.208464115858078, "learning_rate": 4.0351970575089435e-05, "loss": 0.8018, "num_input_tokens_seen": 13960848, "step": 24210 }, { "epoch": 3.606642835865356, "grad_norm": 0.23402033746242523, "learning_rate": 4.0346840879101277e-05, "loss": 0.8519, "num_input_tokens_seen": 13963824, "step": 24215 }, { "epoch": 3.607387548406315, "grad_norm": 0.2787191867828369, "learning_rate": 4.0341710146036e-05, "loss": 0.836, "num_input_tokens_seen": 13966608, "step": 24220 }, { "epoch": 3.6081322609472744, "grad_norm": 0.3460513949394226, "learning_rate": 4.033657837624033e-05, "loss": 0.8103, "num_input_tokens_seen": 13969584, "step": 24225 }, { "epoch": 3.6088769734882336, "grad_norm": 0.26266390085220337, "learning_rate": 4.033144557006104e-05, "loss": 0.7787, "num_input_tokens_seen": 13972464, "step": 24230 }, { "epoch": 3.6096216860291928, "grad_norm": 0.2418290227651596, "learning_rate": 4.032631172784501e-05, "loss": 0.8217, "num_input_tokens_seen": 13975216, "step": 24235 }, { "epoch": 3.610366398570152, "grad_norm": 0.2981438636779785, "learning_rate": 4.0321176849939135e-05, "loss": 0.7914, "num_input_tokens_seen": 13978320, "step": 24240 }, { "epoch": 3.611111111111111, "grad_norm": 0.22762517631053925, "learning_rate": 4.031604093669042e-05, "loss": 0.7645, "num_input_tokens_seen": 13981200, "step": 24245 }, { "epoch": 3.6118558236520704, "grad_norm": 0.32320088148117065, "learning_rate": 4.031090398844593e-05, "loss": 0.7946, "num_input_tokens_seen": 13984112, "step": 24250 }, { "epoch": 3.6126005361930296, "grad_norm": 0.2466600090265274, "learning_rate": 4.030576600555279e-05, "loss": 0.7919, "num_input_tokens_seen": 13986992, "step": 24255 }, { "epoch": 3.6133452487339888, "grad_norm": 0.32032111287117004, "learning_rate": 4.030062698835822e-05, "loss": 0.7869, "num_input_tokens_seen": 13990160, "step": 24260 }, { "epoch": 3.614089961274948, "grad_norm": 0.30264660716056824, "learning_rate": 4.029548693720949e-05, "loss": 0.8353, "num_input_tokens_seen": 13993200, "step": 24265 }, { "epoch": 3.614834673815907, "grad_norm": 0.3258049190044403, "learning_rate": 4.029034585245393e-05, "loss": 0.8105, "num_input_tokens_seen": 13996176, "step": 24270 }, { "epoch": 3.6155793863568664, "grad_norm": 0.34102436900138855, "learning_rate": 4.028520373443897e-05, "loss": 0.7937, "num_input_tokens_seen": 13999152, "step": 24275 }, { "epoch": 3.6163240988978256, "grad_norm": 0.22742396593093872, "learning_rate": 4.028006058351208e-05, "loss": 0.8522, "num_input_tokens_seen": 14001904, "step": 24280 }, { "epoch": 3.617068811438785, "grad_norm": 0.2220556139945984, "learning_rate": 4.027491640002083e-05, "loss": 0.8283, "num_input_tokens_seen": 14004432, "step": 24285 }, { "epoch": 3.617813523979744, "grad_norm": 0.23899534344673157, "learning_rate": 4.0269771184312824e-05, "loss": 0.8047, "num_input_tokens_seen": 14007440, "step": 24290 }, { "epoch": 3.6185582365207027, "grad_norm": 0.23164832592010498, "learning_rate": 4.0264624936735776e-05, "loss": 0.7977, "num_input_tokens_seen": 14010576, "step": 24295 }, { "epoch": 3.6193029490616624, "grad_norm": 0.2040502279996872, "learning_rate": 4.0259477657637424e-05, "loss": 0.8133, "num_input_tokens_seen": 14013328, "step": 24300 }, { "epoch": 3.620047661602621, "grad_norm": 0.2740582525730133, "learning_rate": 4.0254329347365614e-05, "loss": 0.7851, "num_input_tokens_seen": 14016272, "step": 24305 }, { "epoch": 3.620792374143581, "grad_norm": 0.27758944034576416, "learning_rate": 4.024918000626825e-05, "loss": 0.8372, "num_input_tokens_seen": 14018928, "step": 24310 }, { "epoch": 3.6215370866845396, "grad_norm": 0.258861243724823, "learning_rate": 4.024402963469329e-05, "loss": 0.8282, "num_input_tokens_seen": 14021744, "step": 24315 }, { "epoch": 3.622281799225499, "grad_norm": 0.2350691556930542, "learning_rate": 4.02388782329888e-05, "loss": 0.777, "num_input_tokens_seen": 14024304, "step": 24320 }, { "epoch": 3.623026511766458, "grad_norm": 0.381099671125412, "learning_rate": 4.023372580150286e-05, "loss": 0.8266, "num_input_tokens_seen": 14027120, "step": 24325 }, { "epoch": 3.6237712243074176, "grad_norm": 0.2257523536682129, "learning_rate": 4.022857234058368e-05, "loss": 0.7991, "num_input_tokens_seen": 14029840, "step": 24330 }, { "epoch": 3.6245159368483764, "grad_norm": 0.2794959843158722, "learning_rate": 4.022341785057949e-05, "loss": 0.7979, "num_input_tokens_seen": 14032752, "step": 24335 }, { "epoch": 3.6252606493893356, "grad_norm": 0.2876809537410736, "learning_rate": 4.021826233183862e-05, "loss": 0.7996, "num_input_tokens_seen": 14035696, "step": 24340 }, { "epoch": 3.6260053619302948, "grad_norm": 0.2644508481025696, "learning_rate": 4.0213105784709445e-05, "loss": 0.8242, "num_input_tokens_seen": 14038704, "step": 24345 }, { "epoch": 3.626750074471254, "grad_norm": 0.32670071721076965, "learning_rate": 4.020794820954044e-05, "loss": 0.787, "num_input_tokens_seen": 14041424, "step": 24350 }, { "epoch": 3.627494787012213, "grad_norm": 0.23309974372386932, "learning_rate": 4.0202789606680136e-05, "loss": 0.7732, "num_input_tokens_seen": 14044304, "step": 24355 }, { "epoch": 3.6282394995531724, "grad_norm": 0.24067452549934387, "learning_rate": 4.01976299764771e-05, "loss": 0.8023, "num_input_tokens_seen": 14047216, "step": 24360 }, { "epoch": 3.6289842120941316, "grad_norm": 0.2324584424495697, "learning_rate": 4.019246931928004e-05, "loss": 0.7983, "num_input_tokens_seen": 14050032, "step": 24365 }, { "epoch": 3.629728924635091, "grad_norm": 0.16883181035518646, "learning_rate": 4.018730763543765e-05, "loss": 0.8091, "num_input_tokens_seen": 14052592, "step": 24370 }, { "epoch": 3.63047363717605, "grad_norm": 0.3154378831386566, "learning_rate": 4.018214492529877e-05, "loss": 0.8253, "num_input_tokens_seen": 14055440, "step": 24375 }, { "epoch": 3.631218349717009, "grad_norm": 0.318764865398407, "learning_rate": 4.017698118921226e-05, "loss": 0.8274, "num_input_tokens_seen": 14058064, "step": 24380 }, { "epoch": 3.6319630622579684, "grad_norm": 0.22376784682273865, "learning_rate": 4.0171816427527064e-05, "loss": 0.8256, "num_input_tokens_seen": 14060880, "step": 24385 }, { "epoch": 3.6327077747989276, "grad_norm": 0.49416691064834595, "learning_rate": 4.016665064059219e-05, "loss": 0.8023, "num_input_tokens_seen": 14063952, "step": 24390 }, { "epoch": 3.633452487339887, "grad_norm": 0.24361778795719147, "learning_rate": 4.016148382875675e-05, "loss": 0.8245, "num_input_tokens_seen": 14066768, "step": 24395 }, { "epoch": 3.634197199880846, "grad_norm": 0.2894562780857086, "learning_rate": 4.0156315992369864e-05, "loss": 0.8174, "num_input_tokens_seen": 14069712, "step": 24400 }, { "epoch": 3.634941912421805, "grad_norm": 0.21651262044906616, "learning_rate": 4.015114713178077e-05, "loss": 0.7957, "num_input_tokens_seen": 14072720, "step": 24405 }, { "epoch": 3.6356866249627644, "grad_norm": 0.2683543860912323, "learning_rate": 4.014597724733874e-05, "loss": 0.8029, "num_input_tokens_seen": 14075376, "step": 24410 }, { "epoch": 3.6364313375037236, "grad_norm": 0.2777867615222931, "learning_rate": 4.0140806339393156e-05, "loss": 0.8219, "num_input_tokens_seen": 14078192, "step": 24415 }, { "epoch": 3.637176050044683, "grad_norm": 0.2106008529663086, "learning_rate": 4.013563440829343e-05, "loss": 0.7931, "num_input_tokens_seen": 14081072, "step": 24420 }, { "epoch": 3.637920762585642, "grad_norm": 0.3230748176574707, "learning_rate": 4.013046145438908e-05, "loss": 0.8108, "num_input_tokens_seen": 14084080, "step": 24425 }, { "epoch": 3.638665475126601, "grad_norm": 0.2455756962299347, "learning_rate": 4.012528747802965e-05, "loss": 0.8071, "num_input_tokens_seen": 14087024, "step": 24430 }, { "epoch": 3.6394101876675604, "grad_norm": 0.24886152148246765, "learning_rate": 4.0120112479564795e-05, "loss": 0.7958, "num_input_tokens_seen": 14089808, "step": 24435 }, { "epoch": 3.6401549002085196, "grad_norm": 0.2914407551288605, "learning_rate": 4.01149364593442e-05, "loss": 0.7904, "num_input_tokens_seen": 14092784, "step": 24440 }, { "epoch": 3.640899612749479, "grad_norm": 0.29374897480010986, "learning_rate": 4.010975941771766e-05, "loss": 0.8301, "num_input_tokens_seen": 14095600, "step": 24445 }, { "epoch": 3.641644325290438, "grad_norm": 0.2941935956478119, "learning_rate": 4.0104581355035015e-05, "loss": 0.7816, "num_input_tokens_seen": 14098544, "step": 24450 }, { "epoch": 3.6423890378313972, "grad_norm": 0.27679476141929626, "learning_rate": 4.0099402271646166e-05, "loss": 0.8254, "num_input_tokens_seen": 14101552, "step": 24455 }, { "epoch": 3.6431337503723564, "grad_norm": 0.24727270007133484, "learning_rate": 4.009422216790111e-05, "loss": 0.8106, "num_input_tokens_seen": 14104656, "step": 24460 }, { "epoch": 3.6438784629133156, "grad_norm": 0.20708705484867096, "learning_rate": 4.008904104414988e-05, "loss": 0.7816, "num_input_tokens_seen": 14107504, "step": 24465 }, { "epoch": 3.6446231754542744, "grad_norm": 0.2573678195476532, "learning_rate": 4.0083858900742604e-05, "loss": 0.799, "num_input_tokens_seen": 14110384, "step": 24470 }, { "epoch": 3.645367887995234, "grad_norm": 0.20451894402503967, "learning_rate": 4.007867573802947e-05, "loss": 0.8339, "num_input_tokens_seen": 14113264, "step": 24475 }, { "epoch": 3.646112600536193, "grad_norm": 0.2458501011133194, "learning_rate": 4.007349155636074e-05, "loss": 0.8078, "num_input_tokens_seen": 14115984, "step": 24480 }, { "epoch": 3.6468573130771524, "grad_norm": 0.36462900042533875, "learning_rate": 4.006830635608673e-05, "loss": 0.8099, "num_input_tokens_seen": 14118960, "step": 24485 }, { "epoch": 3.647602025618111, "grad_norm": 0.2760140895843506, "learning_rate": 4.006312013755784e-05, "loss": 0.807, "num_input_tokens_seen": 14122160, "step": 24490 }, { "epoch": 3.648346738159071, "grad_norm": 0.2319212555885315, "learning_rate": 4.005793290112454e-05, "loss": 0.7982, "num_input_tokens_seen": 14124688, "step": 24495 }, { "epoch": 3.6490914507000296, "grad_norm": 0.254355251789093, "learning_rate": 4.005274464713735e-05, "loss": 0.8506, "num_input_tokens_seen": 14127504, "step": 24500 }, { "epoch": 3.6498361632409893, "grad_norm": 0.33702170848846436, "learning_rate": 4.0047555375946876e-05, "loss": 0.8221, "num_input_tokens_seen": 14130352, "step": 24505 }, { "epoch": 3.650580875781948, "grad_norm": 0.2727579176425934, "learning_rate": 4.004236508790379e-05, "loss": 0.8186, "num_input_tokens_seen": 14133552, "step": 24510 }, { "epoch": 3.651325588322907, "grad_norm": 0.23324964940547943, "learning_rate": 4.003717378335883e-05, "loss": 0.8005, "num_input_tokens_seen": 14136240, "step": 24515 }, { "epoch": 3.6520703008638664, "grad_norm": 0.23500578105449677, "learning_rate": 4.0031981462662806e-05, "loss": 0.8189, "num_input_tokens_seen": 14139280, "step": 24520 }, { "epoch": 3.6528150134048256, "grad_norm": 0.2618218660354614, "learning_rate": 4.002678812616658e-05, "loss": 0.7954, "num_input_tokens_seen": 14142160, "step": 24525 }, { "epoch": 3.653559725945785, "grad_norm": 0.17544908821582794, "learning_rate": 4.002159377422111e-05, "loss": 0.7919, "num_input_tokens_seen": 14145104, "step": 24530 }, { "epoch": 3.654304438486744, "grad_norm": 0.2888043224811554, "learning_rate": 4.001639840717741e-05, "loss": 0.8056, "num_input_tokens_seen": 14148016, "step": 24535 }, { "epoch": 3.6550491510277032, "grad_norm": 0.2001264989376068, "learning_rate": 4.001120202538656e-05, "loss": 0.7946, "num_input_tokens_seen": 14150768, "step": 24540 }, { "epoch": 3.6557938635686624, "grad_norm": 0.3158332109451294, "learning_rate": 4.000600462919971e-05, "loss": 0.8065, "num_input_tokens_seen": 14153648, "step": 24545 }, { "epoch": 3.6565385761096216, "grad_norm": 0.2205996960401535, "learning_rate": 4.000080621896807e-05, "loss": 0.7974, "num_input_tokens_seen": 14156688, "step": 24550 }, { "epoch": 3.657283288650581, "grad_norm": 0.2826334238052368, "learning_rate": 3.9995606795042936e-05, "loss": 0.7852, "num_input_tokens_seen": 14159504, "step": 24555 }, { "epoch": 3.65802800119154, "grad_norm": 0.21118603646755219, "learning_rate": 3.9990406357775664e-05, "loss": 0.8048, "num_input_tokens_seen": 14162544, "step": 24560 }, { "epoch": 3.6587727137324992, "grad_norm": 0.3237203061580658, "learning_rate": 3.998520490751767e-05, "loss": 0.7892, "num_input_tokens_seen": 14165520, "step": 24565 }, { "epoch": 3.6595174262734584, "grad_norm": 0.25041595101356506, "learning_rate": 3.998000244462046e-05, "loss": 0.803, "num_input_tokens_seen": 14168272, "step": 24570 }, { "epoch": 3.6602621388144176, "grad_norm": 0.35221314430236816, "learning_rate": 3.997479896943559e-05, "loss": 0.7831, "num_input_tokens_seen": 14171312, "step": 24575 }, { "epoch": 3.661006851355377, "grad_norm": 0.3757386803627014, "learning_rate": 3.996959448231469e-05, "loss": 0.8083, "num_input_tokens_seen": 14174160, "step": 24580 }, { "epoch": 3.661751563896336, "grad_norm": 0.3060723543167114, "learning_rate": 3.9964388983609455e-05, "loss": 0.8184, "num_input_tokens_seen": 14177104, "step": 24585 }, { "epoch": 3.6624962764372953, "grad_norm": 0.2840501070022583, "learning_rate": 3.995918247367165e-05, "loss": 0.8243, "num_input_tokens_seen": 14179888, "step": 24590 }, { "epoch": 3.6632409889782545, "grad_norm": 0.37075182795524597, "learning_rate": 3.9953974952853125e-05, "loss": 0.7814, "num_input_tokens_seen": 14182544, "step": 24595 }, { "epoch": 3.6639857015192137, "grad_norm": 0.2472815215587616, "learning_rate": 3.994876642150576e-05, "loss": 0.8052, "num_input_tokens_seen": 14185264, "step": 24600 }, { "epoch": 3.664730414060173, "grad_norm": 0.2955203056335449, "learning_rate": 3.9943556879981534e-05, "loss": 0.7905, "num_input_tokens_seen": 14188272, "step": 24605 }, { "epoch": 3.665475126601132, "grad_norm": 0.35346677899360657, "learning_rate": 3.993834632863249e-05, "loss": 0.8144, "num_input_tokens_seen": 14191440, "step": 24610 }, { "epoch": 3.6662198391420913, "grad_norm": 0.2970695495605469, "learning_rate": 3.993313476781075e-05, "loss": 0.8529, "num_input_tokens_seen": 14194256, "step": 24615 }, { "epoch": 3.6669645516830505, "grad_norm": 0.2772374749183655, "learning_rate": 3.992792219786847e-05, "loss": 0.7925, "num_input_tokens_seen": 14197296, "step": 24620 }, { "epoch": 3.6677092642240097, "grad_norm": 0.2433733493089676, "learning_rate": 3.9922708619157894e-05, "loss": 0.8248, "num_input_tokens_seen": 14200176, "step": 24625 }, { "epoch": 3.668453976764969, "grad_norm": 0.27681729197502136, "learning_rate": 3.9917494032031346e-05, "loss": 0.7676, "num_input_tokens_seen": 14203088, "step": 24630 }, { "epoch": 3.669198689305928, "grad_norm": 0.2072458416223526, "learning_rate": 3.99122784368412e-05, "loss": 0.8043, "num_input_tokens_seen": 14206000, "step": 24635 }, { "epoch": 3.6699434018468873, "grad_norm": 0.26319384574890137, "learning_rate": 3.990706183393991e-05, "loss": 0.8104, "num_input_tokens_seen": 14209232, "step": 24640 }, { "epoch": 3.670688114387846, "grad_norm": 0.2641836702823639, "learning_rate": 3.990184422367998e-05, "loss": 0.8046, "num_input_tokens_seen": 14211792, "step": 24645 }, { "epoch": 3.6714328269288057, "grad_norm": 0.3055122196674347, "learning_rate": 3.989662560641401e-05, "loss": 0.7918, "num_input_tokens_seen": 14214640, "step": 24650 }, { "epoch": 3.6721775394697644, "grad_norm": 0.2937922179698944, "learning_rate": 3.9891405982494647e-05, "loss": 0.7669, "num_input_tokens_seen": 14217904, "step": 24655 }, { "epoch": 3.672922252010724, "grad_norm": 0.2629525363445282, "learning_rate": 3.988618535227461e-05, "loss": 0.7983, "num_input_tokens_seen": 14221136, "step": 24660 }, { "epoch": 3.673666964551683, "grad_norm": 0.258541464805603, "learning_rate": 3.988096371610669e-05, "loss": 0.7844, "num_input_tokens_seen": 14224176, "step": 24665 }, { "epoch": 3.6744116770926425, "grad_norm": 0.35367974638938904, "learning_rate": 3.9875741074343744e-05, "loss": 0.8158, "num_input_tokens_seen": 14227152, "step": 24670 }, { "epoch": 3.6751563896336013, "grad_norm": 0.29916897416114807, "learning_rate": 3.98705174273387e-05, "loss": 0.8172, "num_input_tokens_seen": 14229776, "step": 24675 }, { "epoch": 3.675901102174561, "grad_norm": 0.323102742433548, "learning_rate": 3.986529277544454e-05, "loss": 0.8007, "num_input_tokens_seen": 14232720, "step": 24680 }, { "epoch": 3.6766458147155197, "grad_norm": 0.3135926425457001, "learning_rate": 3.9860067119014334e-05, "loss": 0.7856, "num_input_tokens_seen": 14235536, "step": 24685 }, { "epoch": 3.677390527256479, "grad_norm": 0.2738969624042511, "learning_rate": 3.985484045840121e-05, "loss": 0.83, "num_input_tokens_seen": 14238448, "step": 24690 }, { "epoch": 3.678135239797438, "grad_norm": 0.34690746665000916, "learning_rate": 3.984961279395836e-05, "loss": 0.8015, "num_input_tokens_seen": 14241168, "step": 24695 }, { "epoch": 3.6788799523383973, "grad_norm": 0.25529026985168457, "learning_rate": 3.9844384126039055e-05, "loss": 0.8174, "num_input_tokens_seen": 14244080, "step": 24700 }, { "epoch": 3.6796246648793565, "grad_norm": 0.2796896994113922, "learning_rate": 3.983915445499663e-05, "loss": 0.8035, "num_input_tokens_seen": 14247088, "step": 24705 }, { "epoch": 3.6803693774203157, "grad_norm": 0.31232988834381104, "learning_rate": 3.983392378118447e-05, "loss": 0.7702, "num_input_tokens_seen": 14250288, "step": 24710 }, { "epoch": 3.681114089961275, "grad_norm": 0.2716456353664398, "learning_rate": 3.9828692104956054e-05, "loss": 0.7999, "num_input_tokens_seen": 14253872, "step": 24715 }, { "epoch": 3.681858802502234, "grad_norm": 0.25662294030189514, "learning_rate": 3.982345942666492e-05, "loss": 0.7954, "num_input_tokens_seen": 14256688, "step": 24720 }, { "epoch": 3.6826035150431933, "grad_norm": 0.20584514737129211, "learning_rate": 3.981822574666466e-05, "loss": 0.8245, "num_input_tokens_seen": 14259280, "step": 24725 }, { "epoch": 3.6833482275841525, "grad_norm": 0.2873000204563141, "learning_rate": 3.9812991065308946e-05, "loss": 0.8153, "num_input_tokens_seen": 14261872, "step": 24730 }, { "epoch": 3.6840929401251117, "grad_norm": 0.23320244252681732, "learning_rate": 3.980775538295153e-05, "loss": 0.8183, "num_input_tokens_seen": 14264752, "step": 24735 }, { "epoch": 3.684837652666071, "grad_norm": 0.20859740674495697, "learning_rate": 3.98025186999462e-05, "loss": 0.8114, "num_input_tokens_seen": 14267792, "step": 24740 }, { "epoch": 3.68558236520703, "grad_norm": 0.3038598597049713, "learning_rate": 3.979728101664685e-05, "loss": 0.8022, "num_input_tokens_seen": 14270544, "step": 24745 }, { "epoch": 3.6863270777479893, "grad_norm": 0.26257067918777466, "learning_rate": 3.9792042333407404e-05, "loss": 0.7992, "num_input_tokens_seen": 14273488, "step": 24750 }, { "epoch": 3.6870717902889485, "grad_norm": 0.23517842590808868, "learning_rate": 3.978680265058187e-05, "loss": 0.8462, "num_input_tokens_seen": 14276560, "step": 24755 }, { "epoch": 3.6878165028299077, "grad_norm": 0.2222248911857605, "learning_rate": 3.978156196852435e-05, "loss": 0.7836, "num_input_tokens_seen": 14279472, "step": 24760 }, { "epoch": 3.688561215370867, "grad_norm": 0.2358660250902176, "learning_rate": 3.977632028758895e-05, "loss": 0.7968, "num_input_tokens_seen": 14282416, "step": 24765 }, { "epoch": 3.689305927911826, "grad_norm": 0.30241167545318604, "learning_rate": 3.977107760812991e-05, "loss": 0.813, "num_input_tokens_seen": 14285264, "step": 24770 }, { "epoch": 3.6900506404527853, "grad_norm": 0.27119553089141846, "learning_rate": 3.976583393050151e-05, "loss": 0.7908, "num_input_tokens_seen": 14288240, "step": 24775 }, { "epoch": 3.6907953529937445, "grad_norm": 0.21642713248729706, "learning_rate": 3.976058925505807e-05, "loss": 0.7974, "num_input_tokens_seen": 14291088, "step": 24780 }, { "epoch": 3.6915400655347037, "grad_norm": 0.2101258486509323, "learning_rate": 3.975534358215403e-05, "loss": 0.8389, "num_input_tokens_seen": 14294384, "step": 24785 }, { "epoch": 3.692284778075663, "grad_norm": 0.2626270651817322, "learning_rate": 3.9750096912143855e-05, "loss": 0.7802, "num_input_tokens_seen": 14297232, "step": 24790 }, { "epoch": 3.693029490616622, "grad_norm": 0.23030054569244385, "learning_rate": 3.97448492453821e-05, "loss": 0.8073, "num_input_tokens_seen": 14300336, "step": 24795 }, { "epoch": 3.6937742031575813, "grad_norm": 0.22478261590003967, "learning_rate": 3.973960058222339e-05, "loss": 0.7857, "num_input_tokens_seen": 14303088, "step": 24800 }, { "epoch": 3.6945189156985405, "grad_norm": 0.23668742179870605, "learning_rate": 3.973435092302239e-05, "loss": 0.7812, "num_input_tokens_seen": 14305744, "step": 24805 }, { "epoch": 3.6952636282394993, "grad_norm": 0.2631029784679413, "learning_rate": 3.972910026813387e-05, "loss": 0.8081, "num_input_tokens_seen": 14308912, "step": 24810 }, { "epoch": 3.696008340780459, "grad_norm": 0.2357921451330185, "learning_rate": 3.972384861791263e-05, "loss": 0.8077, "num_input_tokens_seen": 14311792, "step": 24815 }, { "epoch": 3.6967530533214177, "grad_norm": 0.24583914875984192, "learning_rate": 3.971859597271357e-05, "loss": 0.8034, "num_input_tokens_seen": 14314416, "step": 24820 }, { "epoch": 3.6974977658623773, "grad_norm": 0.3025071918964386, "learning_rate": 3.9713342332891625e-05, "loss": 0.7874, "num_input_tokens_seen": 14317328, "step": 24825 }, { "epoch": 3.698242478403336, "grad_norm": 0.3675002455711365, "learning_rate": 3.9708087698801834e-05, "loss": 0.8028, "num_input_tokens_seen": 14320144, "step": 24830 }, { "epoch": 3.6989871909442957, "grad_norm": 0.23390959203243256, "learning_rate": 3.9702832070799265e-05, "loss": 0.8119, "num_input_tokens_seen": 14323024, "step": 24835 }, { "epoch": 3.6997319034852545, "grad_norm": 0.20797237753868103, "learning_rate": 3.969757544923909e-05, "loss": 0.7947, "num_input_tokens_seen": 14325520, "step": 24840 }, { "epoch": 3.700476616026214, "grad_norm": 0.27456358075141907, "learning_rate": 3.969231783447652e-05, "loss": 0.7969, "num_input_tokens_seen": 14328624, "step": 24845 }, { "epoch": 3.701221328567173, "grad_norm": 0.24567091464996338, "learning_rate": 3.9687059226866854e-05, "loss": 0.7783, "num_input_tokens_seen": 14331312, "step": 24850 }, { "epoch": 3.701966041108132, "grad_norm": 0.19174812734127045, "learning_rate": 3.9681799626765425e-05, "loss": 0.8257, "num_input_tokens_seen": 14334256, "step": 24855 }, { "epoch": 3.7027107536490913, "grad_norm": 0.25913193821907043, "learning_rate": 3.9676539034527684e-05, "loss": 0.8082, "num_input_tokens_seen": 14337552, "step": 24860 }, { "epoch": 3.7034554661900505, "grad_norm": 0.2691771686077118, "learning_rate": 3.9671277450509094e-05, "loss": 0.7789, "num_input_tokens_seen": 14340432, "step": 24865 }, { "epoch": 3.7042001787310097, "grad_norm": 0.23958757519721985, "learning_rate": 3.9666014875065226e-05, "loss": 0.7909, "num_input_tokens_seen": 14343504, "step": 24870 }, { "epoch": 3.704944891271969, "grad_norm": 0.22937077283859253, "learning_rate": 3.9660751308551705e-05, "loss": 0.834, "num_input_tokens_seen": 14346224, "step": 24875 }, { "epoch": 3.705689603812928, "grad_norm": 0.2826933264732361, "learning_rate": 3.965548675132421e-05, "loss": 0.7927, "num_input_tokens_seen": 14349520, "step": 24880 }, { "epoch": 3.7064343163538873, "grad_norm": 0.31859296560287476, "learning_rate": 3.96502212037385e-05, "loss": 0.8436, "num_input_tokens_seen": 14352624, "step": 24885 }, { "epoch": 3.7071790288948465, "grad_norm": 0.25311172008514404, "learning_rate": 3.964495466615042e-05, "loss": 0.8376, "num_input_tokens_seen": 14355344, "step": 24890 }, { "epoch": 3.7079237414358057, "grad_norm": 0.26977983117103577, "learning_rate": 3.963968713891584e-05, "loss": 0.7882, "num_input_tokens_seen": 14358160, "step": 24895 }, { "epoch": 3.708668453976765, "grad_norm": 0.2197708636522293, "learning_rate": 3.9634418622390727e-05, "loss": 0.8277, "num_input_tokens_seen": 14361008, "step": 24900 }, { "epoch": 3.709413166517724, "grad_norm": 0.24273361265659332, "learning_rate": 3.9629149116931086e-05, "loss": 0.7744, "num_input_tokens_seen": 14363824, "step": 24905 }, { "epoch": 3.7101578790586833, "grad_norm": 0.25423672795295715, "learning_rate": 3.962387862289304e-05, "loss": 0.7961, "num_input_tokens_seen": 14366928, "step": 24910 }, { "epoch": 3.7109025915996425, "grad_norm": 0.24608875811100006, "learning_rate": 3.9618607140632724e-05, "loss": 0.8279, "num_input_tokens_seen": 14369776, "step": 24915 }, { "epoch": 3.7116473041406017, "grad_norm": 0.2156422734260559, "learning_rate": 3.9613334670506384e-05, "loss": 0.818, "num_input_tokens_seen": 14372688, "step": 24920 }, { "epoch": 3.712392016681561, "grad_norm": 0.2546219229698181, "learning_rate": 3.9608061212870294e-05, "loss": 0.8296, "num_input_tokens_seen": 14375568, "step": 24925 }, { "epoch": 3.71313672922252, "grad_norm": 0.25454822182655334, "learning_rate": 3.960278676808082e-05, "loss": 0.7859, "num_input_tokens_seen": 14378224, "step": 24930 }, { "epoch": 3.7138814417634793, "grad_norm": 0.22319796681404114, "learning_rate": 3.959751133649439e-05, "loss": 0.8022, "num_input_tokens_seen": 14380944, "step": 24935 }, { "epoch": 3.7146261543044385, "grad_norm": 0.40421727299690247, "learning_rate": 3.959223491846749e-05, "loss": 0.7765, "num_input_tokens_seen": 14383888, "step": 24940 }, { "epoch": 3.7153708668453977, "grad_norm": 0.22210019826889038, "learning_rate": 3.958695751435668e-05, "loss": 0.8123, "num_input_tokens_seen": 14386928, "step": 24945 }, { "epoch": 3.716115579386357, "grad_norm": 0.20301736891269684, "learning_rate": 3.958167912451859e-05, "loss": 0.7582, "num_input_tokens_seen": 14389680, "step": 24950 }, { "epoch": 3.716860291927316, "grad_norm": 0.2412974238395691, "learning_rate": 3.95763997493099e-05, "loss": 0.8102, "num_input_tokens_seen": 14392656, "step": 24955 }, { "epoch": 3.7176050044682754, "grad_norm": 0.37212008237838745, "learning_rate": 3.95711193890874e-05, "loss": 0.8282, "num_input_tokens_seen": 14395408, "step": 24960 }, { "epoch": 3.7183497170092346, "grad_norm": 0.28476646542549133, "learning_rate": 3.956583804420787e-05, "loss": 0.8678, "num_input_tokens_seen": 14398640, "step": 24965 }, { "epoch": 3.7190944295501938, "grad_norm": 0.3511267304420471, "learning_rate": 3.9560555715028235e-05, "loss": 0.7987, "num_input_tokens_seen": 14401936, "step": 24970 }, { "epoch": 3.719839142091153, "grad_norm": 0.2123168557882309, "learning_rate": 3.9555272401905445e-05, "loss": 0.7661, "num_input_tokens_seen": 14404784, "step": 24975 }, { "epoch": 3.720583854632112, "grad_norm": 0.27157190442085266, "learning_rate": 3.9549988105196525e-05, "loss": 0.8408, "num_input_tokens_seen": 14407728, "step": 24980 }, { "epoch": 3.721328567173071, "grad_norm": 0.2653646767139435, "learning_rate": 3.954470282525856e-05, "loss": 0.8474, "num_input_tokens_seen": 14410544, "step": 24985 }, { "epoch": 3.7220732797140306, "grad_norm": 0.26281923055648804, "learning_rate": 3.9539416562448715e-05, "loss": 0.8133, "num_input_tokens_seen": 14413520, "step": 24990 }, { "epoch": 3.7228179922549893, "grad_norm": 0.22314630448818207, "learning_rate": 3.953412931712421e-05, "loss": 0.7892, "num_input_tokens_seen": 14416464, "step": 24995 }, { "epoch": 3.723562704795949, "grad_norm": 0.25495645403862, "learning_rate": 3.952884108964234e-05, "loss": 0.8039, "num_input_tokens_seen": 14419216, "step": 25000 }, { "epoch": 3.7243074173369077, "grad_norm": 0.2573796212673187, "learning_rate": 3.952355188036046e-05, "loss": 0.7854, "num_input_tokens_seen": 14422032, "step": 25005 }, { "epoch": 3.7250521298778674, "grad_norm": 0.2602653205394745, "learning_rate": 3.9518261689635995e-05, "loss": 0.8423, "num_input_tokens_seen": 14424592, "step": 25010 }, { "epoch": 3.725796842418826, "grad_norm": 0.19374889135360718, "learning_rate": 3.951297051782643e-05, "loss": 0.8338, "num_input_tokens_seen": 14427440, "step": 25015 }, { "epoch": 3.726541554959786, "grad_norm": 0.20229968428611755, "learning_rate": 3.9507678365289316e-05, "loss": 0.8003, "num_input_tokens_seen": 14430704, "step": 25020 }, { "epoch": 3.7272862675007445, "grad_norm": 0.2080785483121872, "learning_rate": 3.950238523238229e-05, "loss": 0.7965, "num_input_tokens_seen": 14433616, "step": 25025 }, { "epoch": 3.7280309800417037, "grad_norm": 0.3013324439525604, "learning_rate": 3.949709111946303e-05, "loss": 0.7685, "num_input_tokens_seen": 14436336, "step": 25030 }, { "epoch": 3.728775692582663, "grad_norm": 0.23601804673671722, "learning_rate": 3.949179602688928e-05, "loss": 0.8276, "num_input_tokens_seen": 14439088, "step": 25035 }, { "epoch": 3.729520405123622, "grad_norm": 0.20395871996879578, "learning_rate": 3.9486499955018893e-05, "loss": 0.7861, "num_input_tokens_seen": 14442032, "step": 25040 }, { "epoch": 3.7302651176645814, "grad_norm": 0.30499985814094543, "learning_rate": 3.948120290420973e-05, "loss": 0.8087, "num_input_tokens_seen": 14445296, "step": 25045 }, { "epoch": 3.7310098302055406, "grad_norm": 0.2512540817260742, "learning_rate": 3.947590487481975e-05, "loss": 0.8174, "num_input_tokens_seen": 14447952, "step": 25050 }, { "epoch": 3.7317545427464998, "grad_norm": 0.2795899212360382, "learning_rate": 3.9470605867206976e-05, "loss": 0.8184, "num_input_tokens_seen": 14450672, "step": 25055 }, { "epoch": 3.732499255287459, "grad_norm": 0.19437070190906525, "learning_rate": 3.946530588172949e-05, "loss": 0.7889, "num_input_tokens_seen": 14453744, "step": 25060 }, { "epoch": 3.733243967828418, "grad_norm": 0.2537388205528259, "learning_rate": 3.946000491874544e-05, "loss": 0.817, "num_input_tokens_seen": 14456592, "step": 25065 }, { "epoch": 3.7339886803693774, "grad_norm": 0.23895405232906342, "learning_rate": 3.945470297861305e-05, "loss": 0.8062, "num_input_tokens_seen": 14459472, "step": 25070 }, { "epoch": 3.7347333929103366, "grad_norm": 0.2824149429798126, "learning_rate": 3.94494000616906e-05, "loss": 0.7787, "num_input_tokens_seen": 14462352, "step": 25075 }, { "epoch": 3.7354781054512958, "grad_norm": 0.27152585983276367, "learning_rate": 3.944409616833645e-05, "loss": 0.7882, "num_input_tokens_seen": 14465264, "step": 25080 }, { "epoch": 3.736222817992255, "grad_norm": 0.44671082496643066, "learning_rate": 3.9438791298909e-05, "loss": 0.8338, "num_input_tokens_seen": 14468176, "step": 25085 }, { "epoch": 3.736967530533214, "grad_norm": 0.2394907921552658, "learning_rate": 3.943348545376673e-05, "loss": 0.8287, "num_input_tokens_seen": 14471216, "step": 25090 }, { "epoch": 3.7377122430741734, "grad_norm": 0.3137868642807007, "learning_rate": 3.94281786332682e-05, "loss": 0.8166, "num_input_tokens_seen": 14474064, "step": 25095 }, { "epoch": 3.7384569556151326, "grad_norm": 0.21046946942806244, "learning_rate": 3.942287083777203e-05, "loss": 0.8152, "num_input_tokens_seen": 14476912, "step": 25100 }, { "epoch": 3.739201668156092, "grad_norm": 0.3453374207019806, "learning_rate": 3.941756206763687e-05, "loss": 0.8188, "num_input_tokens_seen": 14479824, "step": 25105 }, { "epoch": 3.739946380697051, "grad_norm": 0.2880658507347107, "learning_rate": 3.9412252323221495e-05, "loss": 0.8216, "num_input_tokens_seen": 14482672, "step": 25110 }, { "epoch": 3.74069109323801, "grad_norm": 0.21367508172988892, "learning_rate": 3.94069416048847e-05, "loss": 0.7637, "num_input_tokens_seen": 14485264, "step": 25115 }, { "epoch": 3.7414358057789694, "grad_norm": 0.21681465208530426, "learning_rate": 3.940162991298537e-05, "loss": 0.8055, "num_input_tokens_seen": 14488016, "step": 25120 }, { "epoch": 3.7421805183199286, "grad_norm": 0.22615307569503784, "learning_rate": 3.9396317247882444e-05, "loss": 0.8217, "num_input_tokens_seen": 14490768, "step": 25125 }, { "epoch": 3.742925230860888, "grad_norm": 0.3356560170650482, "learning_rate": 3.939100360993492e-05, "loss": 0.8317, "num_input_tokens_seen": 14493488, "step": 25130 }, { "epoch": 3.743669943401847, "grad_norm": 0.3027642071247101, "learning_rate": 3.938568899950188e-05, "loss": 0.8061, "num_input_tokens_seen": 14496432, "step": 25135 }, { "epoch": 3.744414655942806, "grad_norm": 0.2192137986421585, "learning_rate": 3.9380373416942474e-05, "loss": 0.8094, "num_input_tokens_seen": 14499504, "step": 25140 }, { "epoch": 3.7451593684837654, "grad_norm": 0.29169321060180664, "learning_rate": 3.93750568626159e-05, "loss": 0.7975, "num_input_tokens_seen": 14502096, "step": 25145 }, { "epoch": 3.7459040810247246, "grad_norm": 0.21122993528842926, "learning_rate": 3.9369739336881426e-05, "loss": 0.776, "num_input_tokens_seen": 14504912, "step": 25150 }, { "epoch": 3.746648793565684, "grad_norm": 0.23302771151065826, "learning_rate": 3.936442084009839e-05, "loss": 0.8126, "num_input_tokens_seen": 14507664, "step": 25155 }, { "epoch": 3.7473935061066426, "grad_norm": 0.2579331398010254, "learning_rate": 3.9359101372626195e-05, "loss": 0.8275, "num_input_tokens_seen": 14510704, "step": 25160 }, { "epoch": 3.748138218647602, "grad_norm": 0.3119945526123047, "learning_rate": 3.935378093482431e-05, "loss": 0.8143, "num_input_tokens_seen": 14513584, "step": 25165 }, { "epoch": 3.748882931188561, "grad_norm": 0.2285015732049942, "learning_rate": 3.9348459527052264e-05, "loss": 0.7936, "num_input_tokens_seen": 14516528, "step": 25170 }, { "epoch": 3.7496276437295206, "grad_norm": 0.2950437068939209, "learning_rate": 3.9343137149669665e-05, "loss": 0.8119, "num_input_tokens_seen": 14519696, "step": 25175 }, { "epoch": 3.7503723562704794, "grad_norm": 0.228111132979393, "learning_rate": 3.933781380303617e-05, "loss": 0.7898, "num_input_tokens_seen": 14522800, "step": 25180 }, { "epoch": 3.751117068811439, "grad_norm": 0.19729554653167725, "learning_rate": 3.933248948751151e-05, "loss": 0.7875, "num_input_tokens_seen": 14525712, "step": 25185 }, { "epoch": 3.751861781352398, "grad_norm": 0.2407304048538208, "learning_rate": 3.932716420345548e-05, "loss": 0.7577, "num_input_tokens_seen": 14528688, "step": 25190 }, { "epoch": 3.7526064938933574, "grad_norm": 0.27444082498550415, "learning_rate": 3.932183795122795e-05, "loss": 0.8215, "num_input_tokens_seen": 14532016, "step": 25195 }, { "epoch": 3.753351206434316, "grad_norm": 0.20926286280155182, "learning_rate": 3.931651073118884e-05, "loss": 0.771, "num_input_tokens_seen": 14535152, "step": 25200 }, { "epoch": 3.7540959189752754, "grad_norm": 0.23365561664104462, "learning_rate": 3.931118254369813e-05, "loss": 0.7948, "num_input_tokens_seen": 14537968, "step": 25205 }, { "epoch": 3.7548406315162346, "grad_norm": 0.23881258070468903, "learning_rate": 3.93058533891159e-05, "loss": 0.7996, "num_input_tokens_seen": 14540688, "step": 25210 }, { "epoch": 3.755585344057194, "grad_norm": 0.26120585203170776, "learning_rate": 3.930052326780225e-05, "loss": 0.7679, "num_input_tokens_seen": 14543440, "step": 25215 }, { "epoch": 3.756330056598153, "grad_norm": 0.27066144347190857, "learning_rate": 3.929519218011739e-05, "loss": 0.8469, "num_input_tokens_seen": 14546352, "step": 25220 }, { "epoch": 3.757074769139112, "grad_norm": 0.25913920998573303, "learning_rate": 3.928986012642156e-05, "loss": 0.812, "num_input_tokens_seen": 14549520, "step": 25225 }, { "epoch": 3.7578194816800714, "grad_norm": 0.28552916646003723, "learning_rate": 3.9284527107075075e-05, "loss": 0.7628, "num_input_tokens_seen": 14552592, "step": 25230 }, { "epoch": 3.7585641942210306, "grad_norm": 0.27916446328163147, "learning_rate": 3.927919312243833e-05, "loss": 0.7868, "num_input_tokens_seen": 14555984, "step": 25235 }, { "epoch": 3.75930890676199, "grad_norm": 0.3012371063232422, "learning_rate": 3.927385817287177e-05, "loss": 0.8056, "num_input_tokens_seen": 14558800, "step": 25240 }, { "epoch": 3.760053619302949, "grad_norm": 0.2691180408000946, "learning_rate": 3.926852225873591e-05, "loss": 0.7903, "num_input_tokens_seen": 14561584, "step": 25245 }, { "epoch": 3.760798331843908, "grad_norm": 0.17847464978694916, "learning_rate": 3.926318538039132e-05, "loss": 0.8143, "num_input_tokens_seen": 14564656, "step": 25250 }, { "epoch": 3.7615430443848674, "grad_norm": 0.2380862683057785, "learning_rate": 3.9257847538198654e-05, "loss": 0.8196, "num_input_tokens_seen": 14567632, "step": 25255 }, { "epoch": 3.7622877569258266, "grad_norm": 0.2323116809129715, "learning_rate": 3.9252508732518625e-05, "loss": 0.8067, "num_input_tokens_seen": 14570544, "step": 25260 }, { "epoch": 3.763032469466786, "grad_norm": 0.16431812942028046, "learning_rate": 3.9247168963712e-05, "loss": 0.7897, "num_input_tokens_seen": 14573520, "step": 25265 }, { "epoch": 3.763777182007745, "grad_norm": 0.2249077558517456, "learning_rate": 3.924182823213962e-05, "loss": 0.7719, "num_input_tokens_seen": 14576496, "step": 25270 }, { "epoch": 3.7645218945487042, "grad_norm": 0.22336916625499725, "learning_rate": 3.923648653816239e-05, "loss": 0.7995, "num_input_tokens_seen": 14579120, "step": 25275 }, { "epoch": 3.7652666070896634, "grad_norm": 0.18154753744602203, "learning_rate": 3.923114388214128e-05, "loss": 0.8238, "num_input_tokens_seen": 14582000, "step": 25280 }, { "epoch": 3.7660113196306226, "grad_norm": 0.27352020144462585, "learning_rate": 3.922580026443733e-05, "loss": 0.8135, "num_input_tokens_seen": 14584784, "step": 25285 }, { "epoch": 3.766756032171582, "grad_norm": 0.2285783886909485, "learning_rate": 3.922045568541164e-05, "loss": 0.7817, "num_input_tokens_seen": 14587600, "step": 25290 }, { "epoch": 3.767500744712541, "grad_norm": 0.25956615805625916, "learning_rate": 3.921511014542536e-05, "loss": 0.7765, "num_input_tokens_seen": 14590640, "step": 25295 }, { "epoch": 3.7682454572535002, "grad_norm": 0.27462702989578247, "learning_rate": 3.9209763644839736e-05, "loss": 0.8132, "num_input_tokens_seen": 14593360, "step": 25300 }, { "epoch": 3.7689901697944594, "grad_norm": 0.24397218227386475, "learning_rate": 3.9204416184016055e-05, "loss": 0.816, "num_input_tokens_seen": 14596496, "step": 25305 }, { "epoch": 3.7697348823354186, "grad_norm": 0.28527823090553284, "learning_rate": 3.9199067763315685e-05, "loss": 0.7723, "num_input_tokens_seen": 14599664, "step": 25310 }, { "epoch": 3.770479594876378, "grad_norm": 0.17165540158748627, "learning_rate": 3.919371838310004e-05, "loss": 0.7986, "num_input_tokens_seen": 14602640, "step": 25315 }, { "epoch": 3.771224307417337, "grad_norm": 0.37721604108810425, "learning_rate": 3.9188368043730615e-05, "loss": 0.8266, "num_input_tokens_seen": 14605936, "step": 25320 }, { "epoch": 3.7719690199582963, "grad_norm": 0.28773126006126404, "learning_rate": 3.918301674556897e-05, "loss": 0.7748, "num_input_tokens_seen": 14608592, "step": 25325 }, { "epoch": 3.7727137324992555, "grad_norm": 0.21671262383460999, "learning_rate": 3.917766448897671e-05, "loss": 0.7729, "num_input_tokens_seen": 14611440, "step": 25330 }, { "epoch": 3.773458445040214, "grad_norm": 0.31295251846313477, "learning_rate": 3.917231127431552e-05, "loss": 0.8205, "num_input_tokens_seen": 14614448, "step": 25335 }, { "epoch": 3.774203157581174, "grad_norm": 0.20724396407604218, "learning_rate": 3.9166957101947166e-05, "loss": 0.8134, "num_input_tokens_seen": 14617328, "step": 25340 }, { "epoch": 3.7749478701221326, "grad_norm": 0.24004125595092773, "learning_rate": 3.916160197223344e-05, "loss": 0.7599, "num_input_tokens_seen": 14620400, "step": 25345 }, { "epoch": 3.7756925826630923, "grad_norm": 0.2217119038105011, "learning_rate": 3.915624588553624e-05, "loss": 0.8165, "num_input_tokens_seen": 14622960, "step": 25350 }, { "epoch": 3.776437295204051, "grad_norm": 0.23519454896450043, "learning_rate": 3.915088884221749e-05, "loss": 0.7962, "num_input_tokens_seen": 14625648, "step": 25355 }, { "epoch": 3.7771820077450107, "grad_norm": 0.23739966750144958, "learning_rate": 3.914553084263921e-05, "loss": 0.7882, "num_input_tokens_seen": 14628208, "step": 25360 }, { "epoch": 3.7779267202859694, "grad_norm": 0.26394903659820557, "learning_rate": 3.914017188716347e-05, "loss": 0.8107, "num_input_tokens_seen": 14631088, "step": 25365 }, { "epoch": 3.778671432826929, "grad_norm": 0.27540111541748047, "learning_rate": 3.9134811976152393e-05, "loss": 0.782, "num_input_tokens_seen": 14633872, "step": 25370 }, { "epoch": 3.779416145367888, "grad_norm": 0.22485850751399994, "learning_rate": 3.91294511099682e-05, "loss": 0.8533, "num_input_tokens_seen": 14636528, "step": 25375 }, { "epoch": 3.780160857908847, "grad_norm": 0.273431658744812, "learning_rate": 3.912408928897314e-05, "loss": 0.7973, "num_input_tokens_seen": 14639248, "step": 25380 }, { "epoch": 3.7809055704498062, "grad_norm": 0.2907950282096863, "learning_rate": 3.911872651352956e-05, "loss": 0.823, "num_input_tokens_seen": 14642288, "step": 25385 }, { "epoch": 3.7816502829907654, "grad_norm": 0.2449917048215866, "learning_rate": 3.911336278399984e-05, "loss": 0.8066, "num_input_tokens_seen": 14645328, "step": 25390 }, { "epoch": 3.7823949955317246, "grad_norm": 0.29121890664100647, "learning_rate": 3.9107998100746444e-05, "loss": 0.8242, "num_input_tokens_seen": 14648208, "step": 25395 }, { "epoch": 3.783139708072684, "grad_norm": 0.24288418889045715, "learning_rate": 3.9102632464131895e-05, "loss": 0.8204, "num_input_tokens_seen": 14650864, "step": 25400 }, { "epoch": 3.783884420613643, "grad_norm": 0.29369476437568665, "learning_rate": 3.909726587451878e-05, "loss": 0.7857, "num_input_tokens_seen": 14653616, "step": 25405 }, { "epoch": 3.7846291331546023, "grad_norm": 0.25340038537979126, "learning_rate": 3.9091898332269746e-05, "loss": 0.8142, "num_input_tokens_seen": 14656848, "step": 25410 }, { "epoch": 3.7853738456955615, "grad_norm": 0.28716641664505005, "learning_rate": 3.908652983774753e-05, "loss": 0.8062, "num_input_tokens_seen": 14659632, "step": 25415 }, { "epoch": 3.7861185582365207, "grad_norm": 0.31253498792648315, "learning_rate": 3.908116039131489e-05, "loss": 0.8193, "num_input_tokens_seen": 14662448, "step": 25420 }, { "epoch": 3.78686327077748, "grad_norm": 0.2241918444633484, "learning_rate": 3.9075789993334686e-05, "loss": 0.8094, "num_input_tokens_seen": 14665168, "step": 25425 }, { "epoch": 3.787607983318439, "grad_norm": 0.24528756737709045, "learning_rate": 3.907041864416982e-05, "loss": 0.7984, "num_input_tokens_seen": 14668112, "step": 25430 }, { "epoch": 3.7883526958593983, "grad_norm": 0.23316262662410736, "learning_rate": 3.9065046344183265e-05, "loss": 0.8038, "num_input_tokens_seen": 14670832, "step": 25435 }, { "epoch": 3.7890974084003575, "grad_norm": 0.23635762929916382, "learning_rate": 3.905967309373806e-05, "loss": 0.7937, "num_input_tokens_seen": 14673552, "step": 25440 }, { "epoch": 3.7898421209413167, "grad_norm": 0.2350984513759613, "learning_rate": 3.905429889319732e-05, "loss": 0.7827, "num_input_tokens_seen": 14676304, "step": 25445 }, { "epoch": 3.790586833482276, "grad_norm": 0.1819790154695511, "learning_rate": 3.904892374292419e-05, "loss": 0.8319, "num_input_tokens_seen": 14679280, "step": 25450 }, { "epoch": 3.791331546023235, "grad_norm": 0.22527560591697693, "learning_rate": 3.904354764328192e-05, "loss": 0.7868, "num_input_tokens_seen": 14682448, "step": 25455 }, { "epoch": 3.7920762585641943, "grad_norm": 0.2989875376224518, "learning_rate": 3.903817059463379e-05, "loss": 0.8283, "num_input_tokens_seen": 14685296, "step": 25460 }, { "epoch": 3.7928209711051535, "grad_norm": 0.32150423526763916, "learning_rate": 3.903279259734318e-05, "loss": 0.7792, "num_input_tokens_seen": 14688208, "step": 25465 }, { "epoch": 3.7935656836461127, "grad_norm": 0.2759700119495392, "learning_rate": 3.902741365177349e-05, "loss": 0.7884, "num_input_tokens_seen": 14691184, "step": 25470 }, { "epoch": 3.794310396187072, "grad_norm": 0.3636914789676666, "learning_rate": 3.902203375828822e-05, "loss": 0.8017, "num_input_tokens_seen": 14694192, "step": 25475 }, { "epoch": 3.795055108728031, "grad_norm": 0.26113441586494446, "learning_rate": 3.901665291725091e-05, "loss": 0.7978, "num_input_tokens_seen": 14697232, "step": 25480 }, { "epoch": 3.7957998212689903, "grad_norm": 0.21154865622520447, "learning_rate": 3.901127112902519e-05, "loss": 0.8349, "num_input_tokens_seen": 14700080, "step": 25485 }, { "epoch": 3.7965445338099495, "grad_norm": 0.3774438202381134, "learning_rate": 3.9005888393974735e-05, "loss": 0.7937, "num_input_tokens_seen": 14703024, "step": 25490 }, { "epoch": 3.7972892463509087, "grad_norm": 0.24427631497383118, "learning_rate": 3.900050471246328e-05, "loss": 0.7968, "num_input_tokens_seen": 14705584, "step": 25495 }, { "epoch": 3.798033958891868, "grad_norm": 0.33328402042388916, "learning_rate": 3.899512008485464e-05, "loss": 0.8342, "num_input_tokens_seen": 14708432, "step": 25500 }, { "epoch": 3.798778671432827, "grad_norm": 0.18907515704631805, "learning_rate": 3.898973451151269e-05, "loss": 0.8076, "num_input_tokens_seen": 14711024, "step": 25505 }, { "epoch": 3.799523383973786, "grad_norm": 0.2890898287296295, "learning_rate": 3.8984347992801355e-05, "loss": 0.7682, "num_input_tokens_seen": 14713776, "step": 25510 }, { "epoch": 3.8002680965147455, "grad_norm": 0.34403499960899353, "learning_rate": 3.897896052908464e-05, "loss": 0.8145, "num_input_tokens_seen": 14716656, "step": 25515 }, { "epoch": 3.8010128090557043, "grad_norm": 0.2888762056827545, "learning_rate": 3.897357212072661e-05, "loss": 0.8208, "num_input_tokens_seen": 14719696, "step": 25520 }, { "epoch": 3.801757521596664, "grad_norm": 0.255197137594223, "learning_rate": 3.896818276809139e-05, "loss": 0.8062, "num_input_tokens_seen": 14722608, "step": 25525 }, { "epoch": 3.8025022341376227, "grad_norm": 0.23604848980903625, "learning_rate": 3.896279247154316e-05, "loss": 0.821, "num_input_tokens_seen": 14725520, "step": 25530 }, { "epoch": 3.8032469466785823, "grad_norm": 0.3423127830028534, "learning_rate": 3.8957401231446186e-05, "loss": 0.8137, "num_input_tokens_seen": 14728176, "step": 25535 }, { "epoch": 3.803991659219541, "grad_norm": 0.23658201098442078, "learning_rate": 3.895200904816478e-05, "loss": 0.8183, "num_input_tokens_seen": 14731024, "step": 25540 }, { "epoch": 3.8047363717605007, "grad_norm": 0.23569120466709137, "learning_rate": 3.8946615922063334e-05, "loss": 0.7872, "num_input_tokens_seen": 14733808, "step": 25545 }, { "epoch": 3.8054810843014595, "grad_norm": 0.2930655777454376, "learning_rate": 3.894122185350629e-05, "loss": 0.8024, "num_input_tokens_seen": 14737040, "step": 25550 }, { "epoch": 3.8062257968424187, "grad_norm": 0.23677504062652588, "learning_rate": 3.8935826842858144e-05, "loss": 0.7925, "num_input_tokens_seen": 14740016, "step": 25555 }, { "epoch": 3.806970509383378, "grad_norm": 0.19946153461933136, "learning_rate": 3.8930430890483486e-05, "loss": 0.764, "num_input_tokens_seen": 14743344, "step": 25560 }, { "epoch": 3.807715221924337, "grad_norm": 0.2819361984729767, "learning_rate": 3.892503399674694e-05, "loss": 0.8049, "num_input_tokens_seen": 14746096, "step": 25565 }, { "epoch": 3.8084599344652963, "grad_norm": 0.29372695088386536, "learning_rate": 3.8919636162013216e-05, "loss": 0.8077, "num_input_tokens_seen": 14748976, "step": 25570 }, { "epoch": 3.8092046470062555, "grad_norm": 0.23182663321495056, "learning_rate": 3.8914237386647076e-05, "loss": 0.8062, "num_input_tokens_seen": 14751664, "step": 25575 }, { "epoch": 3.8099493595472147, "grad_norm": 0.2922869324684143, "learning_rate": 3.8908837671013345e-05, "loss": 0.7969, "num_input_tokens_seen": 14754608, "step": 25580 }, { "epoch": 3.810694072088174, "grad_norm": 0.2015923708677292, "learning_rate": 3.8903437015476903e-05, "loss": 0.8081, "num_input_tokens_seen": 14757648, "step": 25585 }, { "epoch": 3.811438784629133, "grad_norm": 0.17046210169792175, "learning_rate": 3.889803542040272e-05, "loss": 0.8019, "num_input_tokens_seen": 14760240, "step": 25590 }, { "epoch": 3.8121834971700923, "grad_norm": 0.290178507566452, "learning_rate": 3.889263288615581e-05, "loss": 0.802, "num_input_tokens_seen": 14763280, "step": 25595 }, { "epoch": 3.8129282097110515, "grad_norm": 0.24569757282733917, "learning_rate": 3.888722941310126e-05, "loss": 0.8205, "num_input_tokens_seen": 14766224, "step": 25600 }, { "epoch": 3.8136729222520107, "grad_norm": 0.31648334860801697, "learning_rate": 3.88818250016042e-05, "loss": 0.792, "num_input_tokens_seen": 14769168, "step": 25605 }, { "epoch": 3.81441763479297, "grad_norm": 0.2577643096446991, "learning_rate": 3.887641965202984e-05, "loss": 0.7579, "num_input_tokens_seen": 14772240, "step": 25610 }, { "epoch": 3.815162347333929, "grad_norm": 0.2646186053752899, "learning_rate": 3.887101336474346e-05, "loss": 0.7972, "num_input_tokens_seen": 14774992, "step": 25615 }, { "epoch": 3.8159070598748883, "grad_norm": 0.2588546872138977, "learning_rate": 3.88656061401104e-05, "loss": 0.8448, "num_input_tokens_seen": 14778000, "step": 25620 }, { "epoch": 3.8166517724158475, "grad_norm": 0.2609604299068451, "learning_rate": 3.886019797849605e-05, "loss": 0.8191, "num_input_tokens_seen": 14780912, "step": 25625 }, { "epoch": 3.8173964849568067, "grad_norm": 0.2143697887659073, "learning_rate": 3.8854788880265865e-05, "loss": 0.8132, "num_input_tokens_seen": 14783952, "step": 25630 }, { "epoch": 3.818141197497766, "grad_norm": 0.16734430193901062, "learning_rate": 3.884937884578538e-05, "loss": 0.8052, "num_input_tokens_seen": 14786768, "step": 25635 }, { "epoch": 3.818885910038725, "grad_norm": 0.2246481478214264, "learning_rate": 3.884396787542017e-05, "loss": 0.8091, "num_input_tokens_seen": 14789520, "step": 25640 }, { "epoch": 3.8196306225796843, "grad_norm": 0.31096649169921875, "learning_rate": 3.8838555969535915e-05, "loss": 0.813, "num_input_tokens_seen": 14792752, "step": 25645 }, { "epoch": 3.8203753351206435, "grad_norm": 0.23948463797569275, "learning_rate": 3.8833143128498303e-05, "loss": 0.8378, "num_input_tokens_seen": 14795760, "step": 25650 }, { "epoch": 3.8211200476616027, "grad_norm": 0.31540167331695557, "learning_rate": 3.882772935267312e-05, "loss": 0.8167, "num_input_tokens_seen": 14798672, "step": 25655 }, { "epoch": 3.821864760202562, "grad_norm": 0.2272832691669464, "learning_rate": 3.8822314642426204e-05, "loss": 0.7918, "num_input_tokens_seen": 14801200, "step": 25660 }, { "epoch": 3.822609472743521, "grad_norm": 0.3048403263092041, "learning_rate": 3.8816898998123464e-05, "loss": 0.8141, "num_input_tokens_seen": 14804176, "step": 25665 }, { "epoch": 3.8233541852844803, "grad_norm": 0.2341378778219223, "learning_rate": 3.8811482420130866e-05, "loss": 0.8195, "num_input_tokens_seen": 14806992, "step": 25670 }, { "epoch": 3.824098897825439, "grad_norm": 0.2871454358100891, "learning_rate": 3.8806064908814435e-05, "loss": 0.7689, "num_input_tokens_seen": 14810000, "step": 25675 }, { "epoch": 3.8248436103663987, "grad_norm": 0.24761709570884705, "learning_rate": 3.880064646454027e-05, "loss": 0.7984, "num_input_tokens_seen": 14813008, "step": 25680 }, { "epoch": 3.8255883229073575, "grad_norm": 0.21108339726924896, "learning_rate": 3.8795227087674535e-05, "loss": 0.8137, "num_input_tokens_seen": 14816016, "step": 25685 }, { "epoch": 3.826333035448317, "grad_norm": 0.17876212298870087, "learning_rate": 3.878980677858344e-05, "loss": 0.8109, "num_input_tokens_seen": 14818608, "step": 25690 }, { "epoch": 3.827077747989276, "grad_norm": 0.251467227935791, "learning_rate": 3.878438553763326e-05, "loss": 0.8149, "num_input_tokens_seen": 14821392, "step": 25695 }, { "epoch": 3.8278224605302356, "grad_norm": 0.3131609261035919, "learning_rate": 3.877896336519035e-05, "loss": 0.8239, "num_input_tokens_seen": 14824528, "step": 25700 }, { "epoch": 3.8285671730711943, "grad_norm": 0.18117281794548035, "learning_rate": 3.877354026162112e-05, "loss": 0.7552, "num_input_tokens_seen": 14827536, "step": 25705 }, { "epoch": 3.829311885612154, "grad_norm": 0.16195711493492126, "learning_rate": 3.876811622729203e-05, "loss": 0.7925, "num_input_tokens_seen": 14830320, "step": 25710 }, { "epoch": 3.8300565981531127, "grad_norm": 0.25119641423225403, "learning_rate": 3.8762691262569625e-05, "loss": 0.7924, "num_input_tokens_seen": 14832944, "step": 25715 }, { "epoch": 3.830801310694072, "grad_norm": 0.3126581311225891, "learning_rate": 3.875726536782051e-05, "loss": 0.7802, "num_input_tokens_seen": 14835920, "step": 25720 }, { "epoch": 3.831546023235031, "grad_norm": 0.2940182387828827, "learning_rate": 3.8751838543411325e-05, "loss": 0.7775, "num_input_tokens_seen": 14838896, "step": 25725 }, { "epoch": 3.8322907357759903, "grad_norm": 0.22203393280506134, "learning_rate": 3.8746410789708806e-05, "loss": 0.822, "num_input_tokens_seen": 14841776, "step": 25730 }, { "epoch": 3.8330354483169495, "grad_norm": 0.21525779366493225, "learning_rate": 3.8740982107079735e-05, "loss": 0.8033, "num_input_tokens_seen": 14844784, "step": 25735 }, { "epoch": 3.8337801608579087, "grad_norm": 0.2583272159099579, "learning_rate": 3.873555249589096e-05, "loss": 0.7813, "num_input_tokens_seen": 14847792, "step": 25740 }, { "epoch": 3.834524873398868, "grad_norm": 0.21577736735343933, "learning_rate": 3.873012195650939e-05, "loss": 0.7971, "num_input_tokens_seen": 14850544, "step": 25745 }, { "epoch": 3.835269585939827, "grad_norm": 0.1943158656358719, "learning_rate": 3.8724690489302004e-05, "loss": 0.8158, "num_input_tokens_seen": 14853488, "step": 25750 }, { "epoch": 3.8360142984807863, "grad_norm": 0.28267544507980347, "learning_rate": 3.871925809463583e-05, "loss": 0.827, "num_input_tokens_seen": 14856336, "step": 25755 }, { "epoch": 3.8367590110217455, "grad_norm": 0.17836299538612366, "learning_rate": 3.871382477287797e-05, "loss": 0.8014, "num_input_tokens_seen": 14859120, "step": 25760 }, { "epoch": 3.8375037235627047, "grad_norm": 0.24070103466510773, "learning_rate": 3.87083905243956e-05, "loss": 0.801, "num_input_tokens_seen": 14862288, "step": 25765 }, { "epoch": 3.838248436103664, "grad_norm": 0.24763016402721405, "learning_rate": 3.8702955349555924e-05, "loss": 0.816, "num_input_tokens_seen": 14865136, "step": 25770 }, { "epoch": 3.838993148644623, "grad_norm": 0.6011995673179626, "learning_rate": 3.8697519248726236e-05, "loss": 0.7886, "num_input_tokens_seen": 14868272, "step": 25775 }, { "epoch": 3.8397378611855824, "grad_norm": 0.21714237332344055, "learning_rate": 3.869208222227389e-05, "loss": 0.825, "num_input_tokens_seen": 14870896, "step": 25780 }, { "epoch": 3.8404825737265416, "grad_norm": 0.29242637753486633, "learning_rate": 3.86866442705663e-05, "loss": 0.7984, "num_input_tokens_seen": 14873680, "step": 25785 }, { "epoch": 3.8412272862675008, "grad_norm": 0.2593306303024292, "learning_rate": 3.868120539397093e-05, "loss": 0.7819, "num_input_tokens_seen": 14876656, "step": 25790 }, { "epoch": 3.84197199880846, "grad_norm": 0.28694161772727966, "learning_rate": 3.867576559285533e-05, "loss": 0.8121, "num_input_tokens_seen": 14879184, "step": 25795 }, { "epoch": 3.842716711349419, "grad_norm": 0.22471432387828827, "learning_rate": 3.867032486758708e-05, "loss": 0.7914, "num_input_tokens_seen": 14881936, "step": 25800 }, { "epoch": 3.8434614238903784, "grad_norm": 0.2750178575515747, "learning_rate": 3.8664883218533873e-05, "loss": 0.7641, "num_input_tokens_seen": 14885136, "step": 25805 }, { "epoch": 3.8442061364313376, "grad_norm": 0.29101818799972534, "learning_rate": 3.8659440646063404e-05, "loss": 0.822, "num_input_tokens_seen": 14887856, "step": 25810 }, { "epoch": 3.8449508489722968, "grad_norm": 0.2451399862766266, "learning_rate": 3.865399715054347e-05, "loss": 0.8291, "num_input_tokens_seen": 14890512, "step": 25815 }, { "epoch": 3.845695561513256, "grad_norm": 0.266891211271286, "learning_rate": 3.8648552732341925e-05, "loss": 0.7687, "num_input_tokens_seen": 14893488, "step": 25820 }, { "epoch": 3.846440274054215, "grad_norm": 0.28806760907173157, "learning_rate": 3.8643107391826676e-05, "loss": 0.7902, "num_input_tokens_seen": 14896496, "step": 25825 }, { "epoch": 3.8471849865951744, "grad_norm": 0.24054118990898132, "learning_rate": 3.86376611293657e-05, "loss": 0.7921, "num_input_tokens_seen": 14899504, "step": 25830 }, { "epoch": 3.8479296991361336, "grad_norm": 0.30725911259651184, "learning_rate": 3.8632213945327036e-05, "loss": 0.8022, "num_input_tokens_seen": 14902192, "step": 25835 }, { "epoch": 3.848674411677093, "grad_norm": 0.3418007791042328, "learning_rate": 3.8626765840078765e-05, "loss": 0.8222, "num_input_tokens_seen": 14905424, "step": 25840 }, { "epoch": 3.849419124218052, "grad_norm": 0.1695583611726761, "learning_rate": 3.862131681398907e-05, "loss": 0.8001, "num_input_tokens_seen": 14908240, "step": 25845 }, { "epoch": 3.8501638367590107, "grad_norm": 0.24433909356594086, "learning_rate": 3.8615866867426164e-05, "loss": 0.7989, "num_input_tokens_seen": 14911024, "step": 25850 }, { "epoch": 3.8509085492999704, "grad_norm": 0.2832273840904236, "learning_rate": 3.8610416000758334e-05, "loss": 0.8078, "num_input_tokens_seen": 14914064, "step": 25855 }, { "epoch": 3.851653261840929, "grad_norm": 0.23307880759239197, "learning_rate": 3.860496421435392e-05, "loss": 0.8074, "num_input_tokens_seen": 14916912, "step": 25860 }, { "epoch": 3.852397974381889, "grad_norm": 0.35389062762260437, "learning_rate": 3.859951150858135e-05, "loss": 0.7967, "num_input_tokens_seen": 14919856, "step": 25865 }, { "epoch": 3.8531426869228476, "grad_norm": 0.23315659165382385, "learning_rate": 3.859405788380908e-05, "loss": 0.8022, "num_input_tokens_seen": 14922704, "step": 25870 }, { "epoch": 3.853887399463807, "grad_norm": 0.266282320022583, "learning_rate": 3.858860334040564e-05, "loss": 0.839, "num_input_tokens_seen": 14925872, "step": 25875 }, { "epoch": 3.854632112004766, "grad_norm": 0.2592754364013672, "learning_rate": 3.858314787873964e-05, "loss": 0.8485, "num_input_tokens_seen": 14928720, "step": 25880 }, { "epoch": 3.8553768245457256, "grad_norm": 0.3036995530128479, "learning_rate": 3.857769149917973e-05, "loss": 0.7994, "num_input_tokens_seen": 14931696, "step": 25885 }, { "epoch": 3.8561215370866844, "grad_norm": 0.20294739305973053, "learning_rate": 3.857223420209464e-05, "loss": 0.8145, "num_input_tokens_seen": 14934672, "step": 25890 }, { "epoch": 3.8568662496276436, "grad_norm": 0.1948404312133789, "learning_rate": 3.856677598785313e-05, "loss": 0.8125, "num_input_tokens_seen": 14937488, "step": 25895 }, { "epoch": 3.8576109621686028, "grad_norm": 0.1882021278142929, "learning_rate": 3.856131685682406e-05, "loss": 0.7798, "num_input_tokens_seen": 14940560, "step": 25900 }, { "epoch": 3.858355674709562, "grad_norm": 0.19779828190803528, "learning_rate": 3.855585680937634e-05, "loss": 0.8105, "num_input_tokens_seen": 14943536, "step": 25905 }, { "epoch": 3.859100387250521, "grad_norm": 0.30880606174468994, "learning_rate": 3.8550395845878925e-05, "loss": 0.8021, "num_input_tokens_seen": 14946480, "step": 25910 }, { "epoch": 3.8598450997914804, "grad_norm": 0.22809261083602905, "learning_rate": 3.854493396670085e-05, "loss": 0.7716, "num_input_tokens_seen": 14948976, "step": 25915 }, { "epoch": 3.8605898123324396, "grad_norm": 0.17664127051830292, "learning_rate": 3.8539471172211204e-05, "loss": 0.8068, "num_input_tokens_seen": 14951696, "step": 25920 }, { "epoch": 3.861334524873399, "grad_norm": 0.31189271807670593, "learning_rate": 3.8534007462779154e-05, "loss": 0.8309, "num_input_tokens_seen": 14954384, "step": 25925 }, { "epoch": 3.862079237414358, "grad_norm": 0.306907057762146, "learning_rate": 3.85285428387739e-05, "loss": 0.8032, "num_input_tokens_seen": 14957584, "step": 25930 }, { "epoch": 3.862823949955317, "grad_norm": 0.24158550798892975, "learning_rate": 3.852307730056472e-05, "loss": 0.8239, "num_input_tokens_seen": 14960464, "step": 25935 }, { "epoch": 3.8635686624962764, "grad_norm": 0.27762484550476074, "learning_rate": 3.851761084852096e-05, "loss": 0.7952, "num_input_tokens_seen": 14963568, "step": 25940 }, { "epoch": 3.8643133750372356, "grad_norm": 0.21721534430980682, "learning_rate": 3.851214348301202e-05, "loss": 0.8178, "num_input_tokens_seen": 14966288, "step": 25945 }, { "epoch": 3.865058087578195, "grad_norm": 0.315579354763031, "learning_rate": 3.850667520440735e-05, "loss": 0.7906, "num_input_tokens_seen": 14969008, "step": 25950 }, { "epoch": 3.865802800119154, "grad_norm": 0.24795688688755035, "learning_rate": 3.8501206013076494e-05, "loss": 0.8171, "num_input_tokens_seen": 14971824, "step": 25955 }, { "epoch": 3.866547512660113, "grad_norm": 0.22246068716049194, "learning_rate": 3.849573590938903e-05, "loss": 0.7919, "num_input_tokens_seen": 14974832, "step": 25960 }, { "epoch": 3.8672922252010724, "grad_norm": 0.26278096437454224, "learning_rate": 3.849026489371459e-05, "loss": 0.7706, "num_input_tokens_seen": 14977712, "step": 25965 }, { "epoch": 3.8680369377420316, "grad_norm": 0.24296778440475464, "learning_rate": 3.848479296642291e-05, "loss": 0.7808, "num_input_tokens_seen": 14980528, "step": 25970 }, { "epoch": 3.868781650282991, "grad_norm": 0.31806543469429016, "learning_rate": 3.8479320127883744e-05, "loss": 0.8185, "num_input_tokens_seen": 14983664, "step": 25975 }, { "epoch": 3.86952636282395, "grad_norm": 0.29053041338920593, "learning_rate": 3.8473846378466915e-05, "loss": 0.7936, "num_input_tokens_seen": 14986416, "step": 25980 }, { "epoch": 3.870271075364909, "grad_norm": 0.2891389727592468, "learning_rate": 3.846837171854234e-05, "loss": 0.8143, "num_input_tokens_seen": 14989200, "step": 25985 }, { "epoch": 3.8710157879058684, "grad_norm": 0.20119966566562653, "learning_rate": 3.8462896148479966e-05, "loss": 0.7941, "num_input_tokens_seen": 14992112, "step": 25990 }, { "epoch": 3.8717605004468276, "grad_norm": 0.22152359783649445, "learning_rate": 3.8457419668649795e-05, "loss": 0.799, "num_input_tokens_seen": 14994864, "step": 25995 }, { "epoch": 3.872505212987787, "grad_norm": 0.1812485307455063, "learning_rate": 3.845194227942192e-05, "loss": 0.8479, "num_input_tokens_seen": 14997744, "step": 26000 }, { "epoch": 3.873249925528746, "grad_norm": 0.2748044729232788, "learning_rate": 3.844646398116648e-05, "loss": 0.8298, "num_input_tokens_seen": 15000912, "step": 26005 }, { "epoch": 3.8739946380697052, "grad_norm": 0.26542654633522034, "learning_rate": 3.844098477425368e-05, "loss": 0.8161, "num_input_tokens_seen": 15003824, "step": 26010 }, { "epoch": 3.8747393506106644, "grad_norm": 0.2386380136013031, "learning_rate": 3.843550465905376e-05, "loss": 0.8026, "num_input_tokens_seen": 15006768, "step": 26015 }, { "epoch": 3.8754840631516236, "grad_norm": 0.18620017170906067, "learning_rate": 3.843002363593707e-05, "loss": 0.8106, "num_input_tokens_seen": 15009616, "step": 26020 }, { "epoch": 3.8762287756925824, "grad_norm": 0.4751608371734619, "learning_rate": 3.842454170527398e-05, "loss": 0.824, "num_input_tokens_seen": 15012272, "step": 26025 }, { "epoch": 3.876973488233542, "grad_norm": 0.370595246553421, "learning_rate": 3.841905886743494e-05, "loss": 0.7778, "num_input_tokens_seen": 15015152, "step": 26030 }, { "epoch": 3.877718200774501, "grad_norm": 0.2115550935268402, "learning_rate": 3.841357512279047e-05, "loss": 0.7931, "num_input_tokens_seen": 15018384, "step": 26035 }, { "epoch": 3.8784629133154604, "grad_norm": 0.2149980515241623, "learning_rate": 3.8408090471711125e-05, "loss": 0.7786, "num_input_tokens_seen": 15021040, "step": 26040 }, { "epoch": 3.879207625856419, "grad_norm": 0.32788604497909546, "learning_rate": 3.840260491456753e-05, "loss": 0.8636, "num_input_tokens_seen": 15023760, "step": 26045 }, { "epoch": 3.879952338397379, "grad_norm": 0.23034259676933289, "learning_rate": 3.83971184517304e-05, "loss": 0.8223, "num_input_tokens_seen": 15026384, "step": 26050 }, { "epoch": 3.8806970509383376, "grad_norm": 0.26213186979293823, "learning_rate": 3.8391631083570464e-05, "loss": 0.8301, "num_input_tokens_seen": 15029424, "step": 26055 }, { "epoch": 3.8814417634792973, "grad_norm": 0.22076921164989471, "learning_rate": 3.838614281045855e-05, "loss": 0.8352, "num_input_tokens_seen": 15032144, "step": 26060 }, { "epoch": 3.882186476020256, "grad_norm": 0.26233628392219543, "learning_rate": 3.838065363276553e-05, "loss": 0.8008, "num_input_tokens_seen": 15034768, "step": 26065 }, { "epoch": 3.882931188561215, "grad_norm": 0.26170921325683594, "learning_rate": 3.837516355086234e-05, "loss": 0.8228, "num_input_tokens_seen": 15037808, "step": 26070 }, { "epoch": 3.8836759011021744, "grad_norm": 0.22797426581382751, "learning_rate": 3.8369672565119975e-05, "loss": 0.8053, "num_input_tokens_seen": 15040624, "step": 26075 }, { "epoch": 3.8844206136431336, "grad_norm": 0.19113874435424805, "learning_rate": 3.836418067590949e-05, "loss": 0.8248, "num_input_tokens_seen": 15043440, "step": 26080 }, { "epoch": 3.885165326184093, "grad_norm": 0.22155365347862244, "learning_rate": 3.835868788360201e-05, "loss": 0.8076, "num_input_tokens_seen": 15046160, "step": 26085 }, { "epoch": 3.885910038725052, "grad_norm": 0.280650794506073, "learning_rate": 3.8353194188568725e-05, "loss": 0.8049, "num_input_tokens_seen": 15049104, "step": 26090 }, { "epoch": 3.8866547512660112, "grad_norm": 0.2405771166086197, "learning_rate": 3.8347699591180855e-05, "loss": 0.8004, "num_input_tokens_seen": 15052272, "step": 26095 }, { "epoch": 3.8873994638069704, "grad_norm": 0.2861451804637909, "learning_rate": 3.8342204091809716e-05, "loss": 0.8354, "num_input_tokens_seen": 15055248, "step": 26100 }, { "epoch": 3.8881441763479296, "grad_norm": 0.2400767207145691, "learning_rate": 3.8336707690826676e-05, "loss": 0.8104, "num_input_tokens_seen": 15058032, "step": 26105 }, { "epoch": 3.888888888888889, "grad_norm": 0.18079639971256256, "learning_rate": 3.8331210388603155e-05, "loss": 0.7848, "num_input_tokens_seen": 15060592, "step": 26110 }, { "epoch": 3.889633601429848, "grad_norm": 0.3435169458389282, "learning_rate": 3.8325712185510635e-05, "loss": 0.8122, "num_input_tokens_seen": 15063728, "step": 26115 }, { "epoch": 3.8903783139708072, "grad_norm": 0.1830989420413971, "learning_rate": 3.8320213081920664e-05, "loss": 0.8301, "num_input_tokens_seen": 15066704, "step": 26120 }, { "epoch": 3.8911230265117664, "grad_norm": 0.2729620933532715, "learning_rate": 3.831471307820485e-05, "loss": 0.8109, "num_input_tokens_seen": 15069584, "step": 26125 }, { "epoch": 3.8918677390527256, "grad_norm": 0.27542975544929504, "learning_rate": 3.8309212174734856e-05, "loss": 0.8272, "num_input_tokens_seen": 15072368, "step": 26130 }, { "epoch": 3.892612451593685, "grad_norm": 0.2376699298620224, "learning_rate": 3.8303710371882414e-05, "loss": 0.776, "num_input_tokens_seen": 15076432, "step": 26135 }, { "epoch": 3.893357164134644, "grad_norm": 0.22325041890144348, "learning_rate": 3.8298207670019315e-05, "loss": 0.8012, "num_input_tokens_seen": 15079216, "step": 26140 }, { "epoch": 3.8941018766756033, "grad_norm": 0.24427205324172974, "learning_rate": 3.82927040695174e-05, "loss": 0.81, "num_input_tokens_seen": 15082096, "step": 26145 }, { "epoch": 3.8948465892165625, "grad_norm": 0.22623823583126068, "learning_rate": 3.828719957074861e-05, "loss": 0.8155, "num_input_tokens_seen": 15084784, "step": 26150 }, { "epoch": 3.8955913017575217, "grad_norm": 0.17893531918525696, "learning_rate": 3.828169417408488e-05, "loss": 0.8168, "num_input_tokens_seen": 15087728, "step": 26155 }, { "epoch": 3.896336014298481, "grad_norm": 0.23922382295131683, "learning_rate": 3.8276187879898255e-05, "loss": 0.8206, "num_input_tokens_seen": 15090512, "step": 26160 }, { "epoch": 3.89708072683944, "grad_norm": 0.2515227198600769, "learning_rate": 3.827068068856083e-05, "loss": 0.793, "num_input_tokens_seen": 15093648, "step": 26165 }, { "epoch": 3.8978254393803993, "grad_norm": 0.24783265590667725, "learning_rate": 3.826517260044477e-05, "loss": 0.7961, "num_input_tokens_seen": 15096432, "step": 26170 }, { "epoch": 3.8985701519213585, "grad_norm": 0.22642308473587036, "learning_rate": 3.825966361592227e-05, "loss": 0.7825, "num_input_tokens_seen": 15099344, "step": 26175 }, { "epoch": 3.8993148644623177, "grad_norm": 0.24430431425571442, "learning_rate": 3.8254153735365614e-05, "loss": 0.804, "num_input_tokens_seen": 15102192, "step": 26180 }, { "epoch": 3.900059577003277, "grad_norm": 0.22461552917957306, "learning_rate": 3.8248642959147136e-05, "loss": 0.7895, "num_input_tokens_seen": 15104752, "step": 26185 }, { "epoch": 3.900804289544236, "grad_norm": 0.18273423612117767, "learning_rate": 3.8243131287639234e-05, "loss": 0.8029, "num_input_tokens_seen": 15107472, "step": 26190 }, { "epoch": 3.9015490020851953, "grad_norm": 0.25849974155426025, "learning_rate": 3.823761872121436e-05, "loss": 0.7901, "num_input_tokens_seen": 15110608, "step": 26195 }, { "epoch": 3.902293714626154, "grad_norm": 0.1728014349937439, "learning_rate": 3.823210526024503e-05, "loss": 0.8029, "num_input_tokens_seen": 15113392, "step": 26200 }, { "epoch": 3.9030384271671137, "grad_norm": 0.18744704127311707, "learning_rate": 3.822659090510383e-05, "loss": 0.8301, "num_input_tokens_seen": 15116208, "step": 26205 }, { "epoch": 3.9037831397080724, "grad_norm": 0.22969645261764526, "learning_rate": 3.822107565616339e-05, "loss": 0.7894, "num_input_tokens_seen": 15119024, "step": 26210 }, { "epoch": 3.904527852249032, "grad_norm": 0.343878835439682, "learning_rate": 3.8215559513796405e-05, "loss": 0.8038, "num_input_tokens_seen": 15121616, "step": 26215 }, { "epoch": 3.905272564789991, "grad_norm": 0.31320953369140625, "learning_rate": 3.821004247837564e-05, "loss": 0.836, "num_input_tokens_seen": 15124528, "step": 26220 }, { "epoch": 3.9060172773309505, "grad_norm": 0.21130773425102234, "learning_rate": 3.820452455027391e-05, "loss": 0.8032, "num_input_tokens_seen": 15127856, "step": 26225 }, { "epoch": 3.9067619898719093, "grad_norm": 0.2753387689590454, "learning_rate": 3.819900572986411e-05, "loss": 0.8099, "num_input_tokens_seen": 15130800, "step": 26230 }, { "epoch": 3.907506702412869, "grad_norm": 0.2110828161239624, "learning_rate": 3.8193486017519157e-05, "loss": 0.8148, "num_input_tokens_seen": 15133680, "step": 26235 }, { "epoch": 3.9082514149538277, "grad_norm": 0.2688025236129761, "learning_rate": 3.818796541361206e-05, "loss": 0.8017, "num_input_tokens_seen": 15136880, "step": 26240 }, { "epoch": 3.908996127494787, "grad_norm": 0.2824251353740692, "learning_rate": 3.8182443918515874e-05, "loss": 0.798, "num_input_tokens_seen": 15139952, "step": 26245 }, { "epoch": 3.909740840035746, "grad_norm": 0.21984122693538666, "learning_rate": 3.817692153260374e-05, "loss": 0.7738, "num_input_tokens_seen": 15142672, "step": 26250 }, { "epoch": 3.9104855525767053, "grad_norm": 0.32244673371315, "learning_rate": 3.817139825624881e-05, "loss": 0.7729, "num_input_tokens_seen": 15145520, "step": 26255 }, { "epoch": 3.9112302651176645, "grad_norm": 0.26643070578575134, "learning_rate": 3.8165874089824336e-05, "loss": 0.7912, "num_input_tokens_seen": 15148432, "step": 26260 }, { "epoch": 3.9119749776586237, "grad_norm": 0.22958685457706451, "learning_rate": 3.816034903370362e-05, "loss": 0.7629, "num_input_tokens_seen": 15151504, "step": 26265 }, { "epoch": 3.912719690199583, "grad_norm": 0.30174463987350464, "learning_rate": 3.8154823088260026e-05, "loss": 0.8287, "num_input_tokens_seen": 15154320, "step": 26270 }, { "epoch": 3.913464402740542, "grad_norm": 0.18887268006801605, "learning_rate": 3.8149296253866975e-05, "loss": 0.7688, "num_input_tokens_seen": 15157456, "step": 26275 }, { "epoch": 3.9142091152815013, "grad_norm": 0.23736976087093353, "learning_rate": 3.8143768530897935e-05, "loss": 0.7866, "num_input_tokens_seen": 15160528, "step": 26280 }, { "epoch": 3.9149538278224605, "grad_norm": 0.19765572249889374, "learning_rate": 3.813823991972646e-05, "loss": 0.7837, "num_input_tokens_seen": 15163184, "step": 26285 }, { "epoch": 3.9156985403634197, "grad_norm": 0.27827775478363037, "learning_rate": 3.8132710420726146e-05, "loss": 0.845, "num_input_tokens_seen": 15165904, "step": 26290 }, { "epoch": 3.916443252904379, "grad_norm": 0.23155514895915985, "learning_rate": 3.812718003427066e-05, "loss": 0.8123, "num_input_tokens_seen": 15168624, "step": 26295 }, { "epoch": 3.917187965445338, "grad_norm": 0.21884086728096008, "learning_rate": 3.812164876073371e-05, "loss": 0.8064, "num_input_tokens_seen": 15171440, "step": 26300 }, { "epoch": 3.9179326779862973, "grad_norm": 0.25546807050704956, "learning_rate": 3.8116116600489096e-05, "loss": 0.8013, "num_input_tokens_seen": 15174352, "step": 26305 }, { "epoch": 3.9186773905272565, "grad_norm": 0.23348063230514526, "learning_rate": 3.8110583553910644e-05, "loss": 0.8144, "num_input_tokens_seen": 15177328, "step": 26310 }, { "epoch": 3.9194221030682157, "grad_norm": 0.22731103003025055, "learning_rate": 3.810504962137226e-05, "loss": 0.7933, "num_input_tokens_seen": 15180080, "step": 26315 }, { "epoch": 3.920166815609175, "grad_norm": 0.36562037467956543, "learning_rate": 3.8099514803247905e-05, "loss": 0.8391, "num_input_tokens_seen": 15183024, "step": 26320 }, { "epoch": 3.920911528150134, "grad_norm": 0.44505080580711365, "learning_rate": 3.809397909991159e-05, "loss": 0.7968, "num_input_tokens_seen": 15185712, "step": 26325 }, { "epoch": 3.9216562406910933, "grad_norm": 0.23487158119678497, "learning_rate": 3.808844251173741e-05, "loss": 0.8025, "num_input_tokens_seen": 15188336, "step": 26330 }, { "epoch": 3.9224009532320525, "grad_norm": 0.3121846616268158, "learning_rate": 3.8082905039099496e-05, "loss": 0.8411, "num_input_tokens_seen": 15191056, "step": 26335 }, { "epoch": 3.9231456657730117, "grad_norm": 0.23777425289154053, "learning_rate": 3.8077366682372056e-05, "loss": 0.8003, "num_input_tokens_seen": 15193680, "step": 26340 }, { "epoch": 3.923890378313971, "grad_norm": 0.2206432968378067, "learning_rate": 3.807182744192934e-05, "loss": 0.8066, "num_input_tokens_seen": 15196432, "step": 26345 }, { "epoch": 3.92463509085493, "grad_norm": 0.21310093998908997, "learning_rate": 3.806628731814568e-05, "loss": 0.8218, "num_input_tokens_seen": 15199120, "step": 26350 }, { "epoch": 3.9253798033958893, "grad_norm": 0.34591054916381836, "learning_rate": 3.806074631139543e-05, "loss": 0.8139, "num_input_tokens_seen": 15201936, "step": 26355 }, { "epoch": 3.9261245159368485, "grad_norm": 0.24210605025291443, "learning_rate": 3.805520442205306e-05, "loss": 0.8004, "num_input_tokens_seen": 15205040, "step": 26360 }, { "epoch": 3.9268692284778077, "grad_norm": 0.25250038504600525, "learning_rate": 3.804966165049304e-05, "loss": 0.8051, "num_input_tokens_seen": 15207632, "step": 26365 }, { "epoch": 3.927613941018767, "grad_norm": 0.22158093750476837, "learning_rate": 3.8044117997089954e-05, "loss": 0.8098, "num_input_tokens_seen": 15210320, "step": 26370 }, { "epoch": 3.9283586535597257, "grad_norm": 0.20190168917179108, "learning_rate": 3.803857346221841e-05, "loss": 0.8034, "num_input_tokens_seen": 15213296, "step": 26375 }, { "epoch": 3.9291033661006853, "grad_norm": 0.2524767518043518, "learning_rate": 3.803302804625307e-05, "loss": 0.8022, "num_input_tokens_seen": 15216112, "step": 26380 }, { "epoch": 3.929848078641644, "grad_norm": 0.23596414923667908, "learning_rate": 3.80274817495687e-05, "loss": 0.7797, "num_input_tokens_seen": 15218864, "step": 26385 }, { "epoch": 3.9305927911826037, "grad_norm": 0.2695066034793854, "learning_rate": 3.8021934572540065e-05, "loss": 0.7982, "num_input_tokens_seen": 15221904, "step": 26390 }, { "epoch": 3.9313375037235625, "grad_norm": 0.21839332580566406, "learning_rate": 3.8016386515542035e-05, "loss": 0.8027, "num_input_tokens_seen": 15224656, "step": 26395 }, { "epoch": 3.932082216264522, "grad_norm": 0.31427478790283203, "learning_rate": 3.8010837578949527e-05, "loss": 0.8054, "num_input_tokens_seen": 15227344, "step": 26400 }, { "epoch": 3.932826928805481, "grad_norm": 0.218422070145607, "learning_rate": 3.800528776313752e-05, "loss": 0.8464, "num_input_tokens_seen": 15230320, "step": 26405 }, { "epoch": 3.9335716413464406, "grad_norm": 0.25478363037109375, "learning_rate": 3.799973706848103e-05, "loss": 0.7957, "num_input_tokens_seen": 15233232, "step": 26410 }, { "epoch": 3.9343163538873993, "grad_norm": 0.2791730761528015, "learning_rate": 3.799418549535517e-05, "loss": 0.799, "num_input_tokens_seen": 15236144, "step": 26415 }, { "epoch": 3.9350610664283585, "grad_norm": 0.20848946273326874, "learning_rate": 3.798863304413509e-05, "loss": 0.823, "num_input_tokens_seen": 15239376, "step": 26420 }, { "epoch": 3.9358057789693177, "grad_norm": 0.34655672311782837, "learning_rate": 3.7983079715195984e-05, "loss": 0.819, "num_input_tokens_seen": 15242128, "step": 26425 }, { "epoch": 3.936550491510277, "grad_norm": 0.22085516154766083, "learning_rate": 3.7977525508913145e-05, "loss": 0.7882, "num_input_tokens_seen": 15245040, "step": 26430 }, { "epoch": 3.937295204051236, "grad_norm": 0.22972001135349274, "learning_rate": 3.797197042566189e-05, "loss": 0.8194, "num_input_tokens_seen": 15248368, "step": 26435 }, { "epoch": 3.9380399165921953, "grad_norm": 0.1995040327310562, "learning_rate": 3.796641446581762e-05, "loss": 0.8119, "num_input_tokens_seen": 15250992, "step": 26440 }, { "epoch": 3.9387846291331545, "grad_norm": 0.18921248614788055, "learning_rate": 3.796085762975577e-05, "loss": 0.7831, "num_input_tokens_seen": 15253968, "step": 26445 }, { "epoch": 3.9395293416741137, "grad_norm": 0.2941458523273468, "learning_rate": 3.7955299917851864e-05, "loss": 0.813, "num_input_tokens_seen": 15256944, "step": 26450 }, { "epoch": 3.940274054215073, "grad_norm": 0.33936968445777893, "learning_rate": 3.794974133048146e-05, "loss": 0.8117, "num_input_tokens_seen": 15259504, "step": 26455 }, { "epoch": 3.941018766756032, "grad_norm": 0.23883619904518127, "learning_rate": 3.794418186802018e-05, "loss": 0.785, "num_input_tokens_seen": 15262320, "step": 26460 }, { "epoch": 3.9417634792969913, "grad_norm": 0.2990007698535919, "learning_rate": 3.793862153084372e-05, "loss": 0.7867, "num_input_tokens_seen": 15264880, "step": 26465 }, { "epoch": 3.9425081918379505, "grad_norm": 0.27131417393684387, "learning_rate": 3.793306031932783e-05, "loss": 0.8227, "num_input_tokens_seen": 15267856, "step": 26470 }, { "epoch": 3.9432529043789097, "grad_norm": 0.27832484245300293, "learning_rate": 3.79274982338483e-05, "loss": 0.8116, "num_input_tokens_seen": 15270800, "step": 26475 }, { "epoch": 3.943997616919869, "grad_norm": 0.2867814898490906, "learning_rate": 3.7921935274780994e-05, "loss": 0.7779, "num_input_tokens_seen": 15273936, "step": 26480 }, { "epoch": 3.944742329460828, "grad_norm": 0.29579851031303406, "learning_rate": 3.791637144250184e-05, "loss": 0.8135, "num_input_tokens_seen": 15276816, "step": 26485 }, { "epoch": 3.9454870420017873, "grad_norm": 0.2517038583755493, "learning_rate": 3.791080673738682e-05, "loss": 0.8085, "num_input_tokens_seen": 15279632, "step": 26490 }, { "epoch": 3.9462317545427466, "grad_norm": 0.20940160751342773, "learning_rate": 3.790524115981198e-05, "loss": 0.8317, "num_input_tokens_seen": 15282288, "step": 26495 }, { "epoch": 3.9469764670837058, "grad_norm": 0.288679838180542, "learning_rate": 3.78996747101534e-05, "loss": 0.7909, "num_input_tokens_seen": 15285008, "step": 26500 }, { "epoch": 3.947721179624665, "grad_norm": 0.23995891213417053, "learning_rate": 3.789410738878726e-05, "loss": 0.8074, "num_input_tokens_seen": 15287728, "step": 26505 }, { "epoch": 3.948465892165624, "grad_norm": 0.18561023473739624, "learning_rate": 3.7888539196089755e-05, "loss": 0.8256, "num_input_tokens_seen": 15290384, "step": 26510 }, { "epoch": 3.9492106047065834, "grad_norm": 0.24035708606243134, "learning_rate": 3.788297013243718e-05, "loss": 0.8267, "num_input_tokens_seen": 15293136, "step": 26515 }, { "epoch": 3.9499553172475426, "grad_norm": 0.37016522884368896, "learning_rate": 3.7877400198205856e-05, "loss": 0.8287, "num_input_tokens_seen": 15296176, "step": 26520 }, { "epoch": 3.9507000297885018, "grad_norm": 0.2989571690559387, "learning_rate": 3.7871829393772185e-05, "loss": 0.8186, "num_input_tokens_seen": 15298800, "step": 26525 }, { "epoch": 3.951444742329461, "grad_norm": 0.2575433552265167, "learning_rate": 3.786625771951261e-05, "loss": 0.8026, "num_input_tokens_seen": 15301680, "step": 26530 }, { "epoch": 3.95218945487042, "grad_norm": 0.21547481417655945, "learning_rate": 3.7860685175803654e-05, "loss": 0.8065, "num_input_tokens_seen": 15304432, "step": 26535 }, { "epoch": 3.9529341674113794, "grad_norm": 0.19856418669223785, "learning_rate": 3.785511176302189e-05, "loss": 0.8086, "num_input_tokens_seen": 15307184, "step": 26540 }, { "epoch": 3.9536788799523386, "grad_norm": 0.2607266306877136, "learning_rate": 3.784953748154393e-05, "loss": 0.8002, "num_input_tokens_seen": 15309968, "step": 26545 }, { "epoch": 3.9544235924932973, "grad_norm": 0.277256041765213, "learning_rate": 3.784396233174647e-05, "loss": 0.8037, "num_input_tokens_seen": 15313008, "step": 26550 }, { "epoch": 3.955168305034257, "grad_norm": 0.18678653240203857, "learning_rate": 3.7838386314006256e-05, "loss": 0.8147, "num_input_tokens_seen": 15315824, "step": 26555 }, { "epoch": 3.9559130175752157, "grad_norm": 0.19545838236808777, "learning_rate": 3.78328094287001e-05, "loss": 0.7749, "num_input_tokens_seen": 15318576, "step": 26560 }, { "epoch": 3.9566577301161754, "grad_norm": 0.2478572279214859, "learning_rate": 3.782723167620484e-05, "loss": 0.7784, "num_input_tokens_seen": 15321456, "step": 26565 }, { "epoch": 3.957402442657134, "grad_norm": 0.3154723346233368, "learning_rate": 3.782165305689743e-05, "loss": 0.7956, "num_input_tokens_seen": 15324528, "step": 26570 }, { "epoch": 3.958147155198094, "grad_norm": 0.2601941227912903, "learning_rate": 3.781607357115483e-05, "loss": 0.8116, "num_input_tokens_seen": 15327472, "step": 26575 }, { "epoch": 3.9588918677390526, "grad_norm": 0.26005756855010986, "learning_rate": 3.7810493219354083e-05, "loss": 0.83, "num_input_tokens_seen": 15330064, "step": 26580 }, { "epoch": 3.9596365802800118, "grad_norm": 0.19499342143535614, "learning_rate": 3.780491200187228e-05, "loss": 0.8207, "num_input_tokens_seen": 15333136, "step": 26585 }, { "epoch": 3.960381292820971, "grad_norm": 0.30651921033859253, "learning_rate": 3.77993299190866e-05, "loss": 0.8819, "num_input_tokens_seen": 15335888, "step": 26590 }, { "epoch": 3.96112600536193, "grad_norm": 0.25179076194763184, "learning_rate": 3.7793746971374236e-05, "loss": 0.8283, "num_input_tokens_seen": 15338864, "step": 26595 }, { "epoch": 3.9618707179028894, "grad_norm": 0.2115222066640854, "learning_rate": 3.7788163159112467e-05, "loss": 0.8087, "num_input_tokens_seen": 15341616, "step": 26600 }, { "epoch": 3.9626154304438486, "grad_norm": 0.20072582364082336, "learning_rate": 3.778257848267863e-05, "loss": 0.8183, "num_input_tokens_seen": 15344496, "step": 26605 }, { "epoch": 3.9633601429848078, "grad_norm": 0.26888513565063477, "learning_rate": 3.7776992942450097e-05, "loss": 0.8094, "num_input_tokens_seen": 15347376, "step": 26610 }, { "epoch": 3.964104855525767, "grad_norm": 0.24887438118457794, "learning_rate": 3.777140653880434e-05, "loss": 0.8179, "num_input_tokens_seen": 15350224, "step": 26615 }, { "epoch": 3.964849568066726, "grad_norm": 0.18591101467609406, "learning_rate": 3.776581927211885e-05, "loss": 0.804, "num_input_tokens_seen": 15352848, "step": 26620 }, { "epoch": 3.9655942806076854, "grad_norm": 0.3212965428829193, "learning_rate": 3.7760231142771194e-05, "loss": 0.8155, "num_input_tokens_seen": 15355632, "step": 26625 }, { "epoch": 3.9663389931486446, "grad_norm": 0.20651671290397644, "learning_rate": 3.7754642151139e-05, "loss": 0.7798, "num_input_tokens_seen": 15358640, "step": 26630 }, { "epoch": 3.967083705689604, "grad_norm": 0.23945774137973785, "learning_rate": 3.774905229759994e-05, "loss": 0.811, "num_input_tokens_seen": 15361488, "step": 26635 }, { "epoch": 3.967828418230563, "grad_norm": 0.2567587196826935, "learning_rate": 3.7743461582531767e-05, "loss": 0.8095, "num_input_tokens_seen": 15364240, "step": 26640 }, { "epoch": 3.968573130771522, "grad_norm": 0.22865912318229675, "learning_rate": 3.773787000631226e-05, "loss": 0.8301, "num_input_tokens_seen": 15367056, "step": 26645 }, { "epoch": 3.9693178433124814, "grad_norm": 0.2645789086818695, "learning_rate": 3.77322775693193e-05, "loss": 0.8228, "num_input_tokens_seen": 15369904, "step": 26650 }, { "epoch": 3.9700625558534406, "grad_norm": 0.26726168394088745, "learning_rate": 3.772668427193078e-05, "loss": 0.8208, "num_input_tokens_seen": 15372752, "step": 26655 }, { "epoch": 3.9708072683944, "grad_norm": 0.26297110319137573, "learning_rate": 3.772109011452468e-05, "loss": 0.7925, "num_input_tokens_seen": 15375472, "step": 26660 }, { "epoch": 3.971551980935359, "grad_norm": 0.21050429344177246, "learning_rate": 3.771549509747903e-05, "loss": 0.8114, "num_input_tokens_seen": 15378640, "step": 26665 }, { "epoch": 3.972296693476318, "grad_norm": 0.2833091914653778, "learning_rate": 3.7709899221171924e-05, "loss": 0.8081, "num_input_tokens_seen": 15381616, "step": 26670 }, { "epoch": 3.9730414060172774, "grad_norm": 0.2285328209400177, "learning_rate": 3.7704302485981504e-05, "loss": 0.7908, "num_input_tokens_seen": 15384464, "step": 26675 }, { "epoch": 3.9737861185582366, "grad_norm": 0.19873858988285065, "learning_rate": 3.769870489228596e-05, "loss": 0.8283, "num_input_tokens_seen": 15387472, "step": 26680 }, { "epoch": 3.974530831099196, "grad_norm": 0.2931506931781769, "learning_rate": 3.769310644046359e-05, "loss": 0.819, "num_input_tokens_seen": 15390608, "step": 26685 }, { "epoch": 3.975275543640155, "grad_norm": 0.22161324322223663, "learning_rate": 3.768750713089267e-05, "loss": 0.7678, "num_input_tokens_seen": 15393392, "step": 26690 }, { "epoch": 3.976020256181114, "grad_norm": 0.22708828747272491, "learning_rate": 3.768190696395162e-05, "loss": 0.7695, "num_input_tokens_seen": 15396272, "step": 26695 }, { "epoch": 3.9767649687220734, "grad_norm": 0.2998645305633545, "learning_rate": 3.767630594001885e-05, "loss": 0.8357, "num_input_tokens_seen": 15398896, "step": 26700 }, { "epoch": 3.9775096812630326, "grad_norm": 0.21489036083221436, "learning_rate": 3.767070405947287e-05, "loss": 0.8506, "num_input_tokens_seen": 15401936, "step": 26705 }, { "epoch": 3.978254393803992, "grad_norm": 0.19630306959152222, "learning_rate": 3.7665101322692206e-05, "loss": 0.785, "num_input_tokens_seen": 15404880, "step": 26710 }, { "epoch": 3.9789991063449506, "grad_norm": 0.22364725172519684, "learning_rate": 3.765949773005551e-05, "loss": 0.7953, "num_input_tokens_seen": 15407760, "step": 26715 }, { "epoch": 3.9797438188859102, "grad_norm": 0.1734553724527359, "learning_rate": 3.7653893281941425e-05, "loss": 0.8402, "num_input_tokens_seen": 15410864, "step": 26720 }, { "epoch": 3.980488531426869, "grad_norm": 0.24320657551288605, "learning_rate": 3.764828797872866e-05, "loss": 0.801, "num_input_tokens_seen": 15413680, "step": 26725 }, { "epoch": 3.9812332439678286, "grad_norm": 0.28933629393577576, "learning_rate": 3.764268182079603e-05, "loss": 0.81, "num_input_tokens_seen": 15416592, "step": 26730 }, { "epoch": 3.9819779565087874, "grad_norm": 0.16940359771251678, "learning_rate": 3.7637074808522365e-05, "loss": 0.8168, "num_input_tokens_seen": 15419792, "step": 26735 }, { "epoch": 3.982722669049747, "grad_norm": 0.22991935908794403, "learning_rate": 3.763146694228657e-05, "loss": 0.7725, "num_input_tokens_seen": 15422352, "step": 26740 }, { "epoch": 3.983467381590706, "grad_norm": 0.26800981163978577, "learning_rate": 3.762585822246758e-05, "loss": 0.8063, "num_input_tokens_seen": 15425232, "step": 26745 }, { "epoch": 3.9842120941316654, "grad_norm": 0.229943186044693, "learning_rate": 3.762024864944443e-05, "loss": 0.803, "num_input_tokens_seen": 15428016, "step": 26750 }, { "epoch": 3.984956806672624, "grad_norm": 0.2313077598810196, "learning_rate": 3.761463822359619e-05, "loss": 0.8164, "num_input_tokens_seen": 15430576, "step": 26755 }, { "epoch": 3.9857015192135834, "grad_norm": 0.24733231961727142, "learning_rate": 3.760902694530198e-05, "loss": 0.8107, "num_input_tokens_seen": 15433424, "step": 26760 }, { "epoch": 3.9864462317545426, "grad_norm": 0.28541722893714905, "learning_rate": 3.7603414814940995e-05, "loss": 0.7802, "num_input_tokens_seen": 15436528, "step": 26765 }, { "epoch": 3.987190944295502, "grad_norm": 0.3041996955871582, "learning_rate": 3.7597801832892475e-05, "loss": 0.78, "num_input_tokens_seen": 15439600, "step": 26770 }, { "epoch": 3.987935656836461, "grad_norm": 0.20768791437149048, "learning_rate": 3.759218799953574e-05, "loss": 0.8259, "num_input_tokens_seen": 15442416, "step": 26775 }, { "epoch": 3.98868036937742, "grad_norm": 0.22059716284275055, "learning_rate": 3.758657331525012e-05, "loss": 0.8083, "num_input_tokens_seen": 15445392, "step": 26780 }, { "epoch": 3.9894250819183794, "grad_norm": 0.20119425654411316, "learning_rate": 3.758095778041506e-05, "loss": 0.8007, "num_input_tokens_seen": 15448272, "step": 26785 }, { "epoch": 3.9901697944593386, "grad_norm": 0.15052157640457153, "learning_rate": 3.757534139541002e-05, "loss": 0.7869, "num_input_tokens_seen": 15451184, "step": 26790 }, { "epoch": 3.990914507000298, "grad_norm": 0.20497524738311768, "learning_rate": 3.7569724160614536e-05, "loss": 0.7819, "num_input_tokens_seen": 15454256, "step": 26795 }, { "epoch": 3.991659219541257, "grad_norm": 0.1992855966091156, "learning_rate": 3.75641060764082e-05, "loss": 0.7868, "num_input_tokens_seen": 15456848, "step": 26800 }, { "epoch": 3.9924039320822162, "grad_norm": 0.2444307804107666, "learning_rate": 3.755848714317065e-05, "loss": 0.8087, "num_input_tokens_seen": 15459536, "step": 26805 }, { "epoch": 3.9931486446231754, "grad_norm": 0.2767347991466522, "learning_rate": 3.75528673612816e-05, "loss": 0.7972, "num_input_tokens_seen": 15462640, "step": 26810 }, { "epoch": 3.9938933571641346, "grad_norm": 0.3126620352268219, "learning_rate": 3.7547246731120816e-05, "loss": 0.7912, "num_input_tokens_seen": 15465584, "step": 26815 }, { "epoch": 3.994638069705094, "grad_norm": 0.21175144612789154, "learning_rate": 3.7541625253068117e-05, "loss": 0.8412, "num_input_tokens_seen": 15468592, "step": 26820 }, { "epoch": 3.995382782246053, "grad_norm": 0.23772208392620087, "learning_rate": 3.7536002927503354e-05, "loss": 0.8142, "num_input_tokens_seen": 15471408, "step": 26825 }, { "epoch": 3.9961274947870122, "grad_norm": 0.2879856824874878, "learning_rate": 3.7530379754806494e-05, "loss": 0.8261, "num_input_tokens_seen": 15474384, "step": 26830 }, { "epoch": 3.9968722073279714, "grad_norm": 0.2738114893436432, "learning_rate": 3.752475573535752e-05, "loss": 0.828, "num_input_tokens_seen": 15477264, "step": 26835 }, { "epoch": 3.9976169198689306, "grad_norm": 0.25377994775772095, "learning_rate": 3.7519130869536465e-05, "loss": 0.8278, "num_input_tokens_seen": 15480272, "step": 26840 }, { "epoch": 3.99836163240989, "grad_norm": 0.20941676199436188, "learning_rate": 3.751350515772344e-05, "loss": 0.7852, "num_input_tokens_seen": 15483024, "step": 26845 }, { "epoch": 3.999106344950849, "grad_norm": 0.2068987637758255, "learning_rate": 3.7507878600298626e-05, "loss": 0.8047, "num_input_tokens_seen": 15485680, "step": 26850 }, { "epoch": 3.9998510574918082, "grad_norm": 0.27976974844932556, "learning_rate": 3.750225119764223e-05, "loss": 0.7798, "num_input_tokens_seen": 15488912, "step": 26855 }, { "epoch": 4.0, "eval_loss": 0.8032766580581665, "eval_runtime": 45.4465, "eval_samples_per_second": 65.66, "eval_steps_per_second": 16.415, "num_input_tokens_seen": 15489040, "step": 26856 }, { "epoch": 4.000595770032767, "grad_norm": 0.24828214943408966, "learning_rate": 3.749662295013452e-05, "loss": 0.7943, "num_input_tokens_seen": 15491568, "step": 26860 }, { "epoch": 4.001340482573727, "grad_norm": 0.18625035881996155, "learning_rate": 3.7490993858155837e-05, "loss": 0.8057, "num_input_tokens_seen": 15494608, "step": 26865 }, { "epoch": 4.002085195114685, "grad_norm": 0.3153831958770752, "learning_rate": 3.748536392208658e-05, "loss": 0.7874, "num_input_tokens_seen": 15497744, "step": 26870 }, { "epoch": 4.002829907655645, "grad_norm": 0.31354063749313354, "learning_rate": 3.74797331423072e-05, "loss": 0.7986, "num_input_tokens_seen": 15500624, "step": 26875 }, { "epoch": 4.003574620196604, "grad_norm": 0.29385828971862793, "learning_rate": 3.747410151919817e-05, "loss": 0.7985, "num_input_tokens_seen": 15503856, "step": 26880 }, { "epoch": 4.0043193327375635, "grad_norm": 0.28536340594291687, "learning_rate": 3.746846905314009e-05, "loss": 0.8152, "num_input_tokens_seen": 15506704, "step": 26885 }, { "epoch": 4.005064045278522, "grad_norm": 0.22369904816150665, "learning_rate": 3.746283574451356e-05, "loss": 0.8014, "num_input_tokens_seen": 15509488, "step": 26890 }, { "epoch": 4.005808757819482, "grad_norm": 0.23674587905406952, "learning_rate": 3.7457201593699264e-05, "loss": 0.7908, "num_input_tokens_seen": 15512272, "step": 26895 }, { "epoch": 4.006553470360441, "grad_norm": 0.23399114608764648, "learning_rate": 3.7451566601077936e-05, "loss": 0.778, "num_input_tokens_seen": 15515120, "step": 26900 }, { "epoch": 4.0072981829014, "grad_norm": 0.24551719427108765, "learning_rate": 3.744593076703035e-05, "loss": 0.792, "num_input_tokens_seen": 15517936, "step": 26905 }, { "epoch": 4.008042895442359, "grad_norm": 0.24194103479385376, "learning_rate": 3.744029409193737e-05, "loss": 0.803, "num_input_tokens_seen": 15520624, "step": 26910 }, { "epoch": 4.008787607983319, "grad_norm": 0.22178111970424652, "learning_rate": 3.7434656576179894e-05, "loss": 0.7653, "num_input_tokens_seen": 15523600, "step": 26915 }, { "epoch": 4.009532320524277, "grad_norm": 0.22250518202781677, "learning_rate": 3.742901822013889e-05, "loss": 0.7995, "num_input_tokens_seen": 15526320, "step": 26920 }, { "epoch": 4.010277033065237, "grad_norm": 0.4243949353694916, "learning_rate": 3.7423379024195355e-05, "loss": 0.7966, "num_input_tokens_seen": 15529264, "step": 26925 }, { "epoch": 4.011021745606196, "grad_norm": 0.3335070013999939, "learning_rate": 3.7417738988730375e-05, "loss": 0.8014, "num_input_tokens_seen": 15532048, "step": 26930 }, { "epoch": 4.0117664581471555, "grad_norm": 0.24481037259101868, "learning_rate": 3.7412098114125094e-05, "loss": 0.7783, "num_input_tokens_seen": 15534864, "step": 26935 }, { "epoch": 4.012511170688114, "grad_norm": 0.28999435901641846, "learning_rate": 3.740645640076068e-05, "loss": 0.825, "num_input_tokens_seen": 15537776, "step": 26940 }, { "epoch": 4.013255883229074, "grad_norm": 0.207972452044487, "learning_rate": 3.740081384901837e-05, "loss": 0.8446, "num_input_tokens_seen": 15540560, "step": 26945 }, { "epoch": 4.014000595770033, "grad_norm": 0.2321840226650238, "learning_rate": 3.7395170459279494e-05, "loss": 0.8117, "num_input_tokens_seen": 15543568, "step": 26950 }, { "epoch": 4.014745308310992, "grad_norm": 0.34138160943984985, "learning_rate": 3.738952623192539e-05, "loss": 0.7716, "num_input_tokens_seen": 15546544, "step": 26955 }, { "epoch": 4.015490020851951, "grad_norm": 0.24380025267601013, "learning_rate": 3.738388116733748e-05, "loss": 0.7822, "num_input_tokens_seen": 15549744, "step": 26960 }, { "epoch": 4.016234733392911, "grad_norm": 0.2757631242275238, "learning_rate": 3.737823526589722e-05, "loss": 0.8559, "num_input_tokens_seen": 15552912, "step": 26965 }, { "epoch": 4.0169794459338695, "grad_norm": 0.19249345362186432, "learning_rate": 3.737258852798615e-05, "loss": 0.7926, "num_input_tokens_seen": 15555696, "step": 26970 }, { "epoch": 4.017724158474829, "grad_norm": 0.3340590000152588, "learning_rate": 3.736694095398585e-05, "loss": 0.8348, "num_input_tokens_seen": 15558608, "step": 26975 }, { "epoch": 4.018468871015788, "grad_norm": 0.2705462574958801, "learning_rate": 3.736129254427796e-05, "loss": 0.7943, "num_input_tokens_seen": 15561168, "step": 26980 }, { "epoch": 4.0192135835567475, "grad_norm": 0.2982538938522339, "learning_rate": 3.735564329924419e-05, "loss": 0.8493, "num_input_tokens_seen": 15564016, "step": 26985 }, { "epoch": 4.019958296097706, "grad_norm": 0.3072502613067627, "learning_rate": 3.734999321926626e-05, "loss": 0.7753, "num_input_tokens_seen": 15567088, "step": 26990 }, { "epoch": 4.020703008638666, "grad_norm": 0.247991144657135, "learning_rate": 3.7344342304726014e-05, "loss": 0.8055, "num_input_tokens_seen": 15570128, "step": 26995 }, { "epoch": 4.021447721179625, "grad_norm": 0.1739317625761032, "learning_rate": 3.73386905560053e-05, "loss": 0.8329, "num_input_tokens_seen": 15572912, "step": 27000 }, { "epoch": 4.022192433720583, "grad_norm": 0.3502131402492523, "learning_rate": 3.733303797348604e-05, "loss": 0.809, "num_input_tokens_seen": 15576080, "step": 27005 }, { "epoch": 4.022937146261543, "grad_norm": 0.2527438998222351, "learning_rate": 3.732738455755022e-05, "loss": 0.8102, "num_input_tokens_seen": 15579056, "step": 27010 }, { "epoch": 4.023681858802502, "grad_norm": 0.24587416648864746, "learning_rate": 3.732173030857987e-05, "loss": 0.8111, "num_input_tokens_seen": 15582064, "step": 27015 }, { "epoch": 4.0244265713434615, "grad_norm": 0.21348752081394196, "learning_rate": 3.731607522695709e-05, "loss": 0.8145, "num_input_tokens_seen": 15585072, "step": 27020 }, { "epoch": 4.02517128388442, "grad_norm": 0.2176952064037323, "learning_rate": 3.731041931306401e-05, "loss": 0.7903, "num_input_tokens_seen": 15588080, "step": 27025 }, { "epoch": 4.02591599642538, "grad_norm": 0.22721624374389648, "learning_rate": 3.730476256728284e-05, "loss": 0.8147, "num_input_tokens_seen": 15590864, "step": 27030 }, { "epoch": 4.026660708966339, "grad_norm": 0.22520899772644043, "learning_rate": 3.729910498999585e-05, "loss": 0.8006, "num_input_tokens_seen": 15593648, "step": 27035 }, { "epoch": 4.027405421507298, "grad_norm": 0.3003191351890564, "learning_rate": 3.729344658158535e-05, "loss": 0.7952, "num_input_tokens_seen": 15596720, "step": 27040 }, { "epoch": 4.028150134048257, "grad_norm": 0.22684462368488312, "learning_rate": 3.7287787342433706e-05, "loss": 0.8154, "num_input_tokens_seen": 15599920, "step": 27045 }, { "epoch": 4.028894846589217, "grad_norm": 0.22935231029987335, "learning_rate": 3.728212727292336e-05, "loss": 0.7803, "num_input_tokens_seen": 15603024, "step": 27050 }, { "epoch": 4.0296395591301755, "grad_norm": 0.27777475118637085, "learning_rate": 3.727646637343678e-05, "loss": 0.7849, "num_input_tokens_seen": 15606192, "step": 27055 }, { "epoch": 4.030384271671135, "grad_norm": 0.2555083632469177, "learning_rate": 3.727080464435652e-05, "loss": 0.8045, "num_input_tokens_seen": 15609136, "step": 27060 }, { "epoch": 4.031128984212094, "grad_norm": 0.2738226056098938, "learning_rate": 3.726514208606517e-05, "loss": 0.8202, "num_input_tokens_seen": 15611824, "step": 27065 }, { "epoch": 4.0318736967530535, "grad_norm": 0.41607850790023804, "learning_rate": 3.725947869894538e-05, "loss": 0.8199, "num_input_tokens_seen": 15614992, "step": 27070 }, { "epoch": 4.032618409294012, "grad_norm": 0.22991138696670532, "learning_rate": 3.725381448337987e-05, "loss": 0.8292, "num_input_tokens_seen": 15617616, "step": 27075 }, { "epoch": 4.033363121834972, "grad_norm": 0.33972275257110596, "learning_rate": 3.72481494397514e-05, "loss": 0.7965, "num_input_tokens_seen": 15620176, "step": 27080 }, { "epoch": 4.034107834375931, "grad_norm": 0.26495856046676636, "learning_rate": 3.724248356844278e-05, "loss": 0.7871, "num_input_tokens_seen": 15623024, "step": 27085 }, { "epoch": 4.03485254691689, "grad_norm": 0.2708165645599365, "learning_rate": 3.7236816869836896e-05, "loss": 0.8166, "num_input_tokens_seen": 15626448, "step": 27090 }, { "epoch": 4.035597259457849, "grad_norm": 0.16464656591415405, "learning_rate": 3.723114934431669e-05, "loss": 0.785, "num_input_tokens_seen": 15629360, "step": 27095 }, { "epoch": 4.036341971998809, "grad_norm": 0.2793932855129242, "learning_rate": 3.7225480992265125e-05, "loss": 0.8115, "num_input_tokens_seen": 15632272, "step": 27100 }, { "epoch": 4.0370866845397675, "grad_norm": 0.21478284895420074, "learning_rate": 3.721981181406526e-05, "loss": 0.7953, "num_input_tokens_seen": 15635408, "step": 27105 }, { "epoch": 4.037831397080727, "grad_norm": 0.24037882685661316, "learning_rate": 3.721414181010021e-05, "loss": 0.8191, "num_input_tokens_seen": 15638480, "step": 27110 }, { "epoch": 4.038576109621686, "grad_norm": 0.26152926683425903, "learning_rate": 3.72084709807531e-05, "loss": 0.8031, "num_input_tokens_seen": 15641392, "step": 27115 }, { "epoch": 4.0393208221626455, "grad_norm": 0.2954258322715759, "learning_rate": 3.720279932640717e-05, "loss": 0.7932, "num_input_tokens_seen": 15644464, "step": 27120 }, { "epoch": 4.040065534703604, "grad_norm": 0.20310325920581818, "learning_rate": 3.7197126847445664e-05, "loss": 0.8135, "num_input_tokens_seen": 15647280, "step": 27125 }, { "epoch": 4.040810247244564, "grad_norm": 0.23331640660762787, "learning_rate": 3.719145354425192e-05, "loss": 0.7862, "num_input_tokens_seen": 15650256, "step": 27130 }, { "epoch": 4.041554959785523, "grad_norm": 0.2020462304353714, "learning_rate": 3.718577941720931e-05, "loss": 0.8176, "num_input_tokens_seen": 15653232, "step": 27135 }, { "epoch": 4.042299672326482, "grad_norm": 0.2711718678474426, "learning_rate": 3.7180104466701274e-05, "loss": 0.8053, "num_input_tokens_seen": 15655952, "step": 27140 }, { "epoch": 4.043044384867441, "grad_norm": 0.3662258982658386, "learning_rate": 3.71744286931113e-05, "loss": 0.8321, "num_input_tokens_seen": 15658896, "step": 27145 }, { "epoch": 4.043789097408401, "grad_norm": 0.24739563465118408, "learning_rate": 3.7168752096822924e-05, "loss": 0.8031, "num_input_tokens_seen": 15661872, "step": 27150 }, { "epoch": 4.0445338099493595, "grad_norm": 0.26444634795188904, "learning_rate": 3.716307467821976e-05, "loss": 0.7896, "num_input_tokens_seen": 15664848, "step": 27155 }, { "epoch": 4.045278522490319, "grad_norm": 0.2757110893726349, "learning_rate": 3.7157396437685465e-05, "loss": 0.7955, "num_input_tokens_seen": 15667600, "step": 27160 }, { "epoch": 4.046023235031278, "grad_norm": 0.27468156814575195, "learning_rate": 3.715171737560374e-05, "loss": 0.79, "num_input_tokens_seen": 15670512, "step": 27165 }, { "epoch": 4.046767947572237, "grad_norm": 0.20891931653022766, "learning_rate": 3.7146037492358366e-05, "loss": 0.8221, "num_input_tokens_seen": 15673072, "step": 27170 }, { "epoch": 4.047512660113196, "grad_norm": 0.2810247242450714, "learning_rate": 3.714035678833316e-05, "loss": 0.7968, "num_input_tokens_seen": 15675728, "step": 27175 }, { "epoch": 4.048257372654155, "grad_norm": 0.17716073989868164, "learning_rate": 3.7134675263912e-05, "loss": 0.8078, "num_input_tokens_seen": 15678320, "step": 27180 }, { "epoch": 4.049002085195115, "grad_norm": 0.23989228904247284, "learning_rate": 3.712899291947882e-05, "loss": 0.8073, "num_input_tokens_seen": 15681360, "step": 27185 }, { "epoch": 4.0497467977360735, "grad_norm": 0.3047168552875519, "learning_rate": 3.7123309755417615e-05, "loss": 0.8016, "num_input_tokens_seen": 15684208, "step": 27190 }, { "epoch": 4.050491510277033, "grad_norm": 0.4326563775539398, "learning_rate": 3.7117625772112416e-05, "loss": 0.8327, "num_input_tokens_seen": 15687632, "step": 27195 }, { "epoch": 4.051236222817992, "grad_norm": 0.260797917842865, "learning_rate": 3.711194096994736e-05, "loss": 0.7952, "num_input_tokens_seen": 15690352, "step": 27200 }, { "epoch": 4.0519809353589515, "grad_norm": 0.2705935537815094, "learning_rate": 3.710625534930655e-05, "loss": 0.8004, "num_input_tokens_seen": 15693040, "step": 27205 }, { "epoch": 4.05272564789991, "grad_norm": 0.3029349148273468, "learning_rate": 3.710056891057423e-05, "loss": 0.8082, "num_input_tokens_seen": 15696400, "step": 27210 }, { "epoch": 4.05347036044087, "grad_norm": 0.3679637312889099, "learning_rate": 3.709488165413467e-05, "loss": 0.8295, "num_input_tokens_seen": 15699568, "step": 27215 }, { "epoch": 4.054215072981829, "grad_norm": 0.23126448690891266, "learning_rate": 3.708919358037218e-05, "loss": 0.8046, "num_input_tokens_seen": 15702224, "step": 27220 }, { "epoch": 4.054959785522788, "grad_norm": 0.21275107562541962, "learning_rate": 3.708350468967113e-05, "loss": 0.7907, "num_input_tokens_seen": 15705360, "step": 27225 }, { "epoch": 4.055704498063747, "grad_norm": 0.24338282644748688, "learning_rate": 3.707781498241596e-05, "loss": 0.8043, "num_input_tokens_seen": 15708304, "step": 27230 }, { "epoch": 4.056449210604707, "grad_norm": 0.2525315284729004, "learning_rate": 3.707212445899116e-05, "loss": 0.8119, "num_input_tokens_seen": 15711152, "step": 27235 }, { "epoch": 4.0571939231456655, "grad_norm": 0.2396879345178604, "learning_rate": 3.7066433119781286e-05, "loss": 0.7726, "num_input_tokens_seen": 15714000, "step": 27240 }, { "epoch": 4.057938635686625, "grad_norm": 0.182062566280365, "learning_rate": 3.70607409651709e-05, "loss": 0.7979, "num_input_tokens_seen": 15716656, "step": 27245 }, { "epoch": 4.058683348227584, "grad_norm": 0.28725719451904297, "learning_rate": 3.705504799554469e-05, "loss": 0.8364, "num_input_tokens_seen": 15719440, "step": 27250 }, { "epoch": 4.059428060768544, "grad_norm": 0.18442875146865845, "learning_rate": 3.704935421128734e-05, "loss": 0.8008, "num_input_tokens_seen": 15722288, "step": 27255 }, { "epoch": 4.060172773309502, "grad_norm": 0.24476361274719238, "learning_rate": 3.704365961278363e-05, "loss": 0.817, "num_input_tokens_seen": 15725072, "step": 27260 }, { "epoch": 4.060917485850462, "grad_norm": 0.18387334048748016, "learning_rate": 3.7037964200418365e-05, "loss": 0.7797, "num_input_tokens_seen": 15727920, "step": 27265 }, { "epoch": 4.061662198391421, "grad_norm": 0.2761494219303131, "learning_rate": 3.7032267974576415e-05, "loss": 0.789, "num_input_tokens_seen": 15730736, "step": 27270 }, { "epoch": 4.06240691093238, "grad_norm": 0.26122409105300903, "learning_rate": 3.702657093564272e-05, "loss": 0.7818, "num_input_tokens_seen": 15733552, "step": 27275 }, { "epoch": 4.063151623473339, "grad_norm": 0.23138415813446045, "learning_rate": 3.702087308400226e-05, "loss": 0.8032, "num_input_tokens_seen": 15736464, "step": 27280 }, { "epoch": 4.063896336014299, "grad_norm": 0.22641879320144653, "learning_rate": 3.7015174420040074e-05, "loss": 0.7777, "num_input_tokens_seen": 15739312, "step": 27285 }, { "epoch": 4.0646410485552575, "grad_norm": 0.24150171875953674, "learning_rate": 3.7009474944141244e-05, "loss": 0.8165, "num_input_tokens_seen": 15742672, "step": 27290 }, { "epoch": 4.065385761096217, "grad_norm": 0.17549081146717072, "learning_rate": 3.7003774656690924e-05, "loss": 0.8033, "num_input_tokens_seen": 15745488, "step": 27295 }, { "epoch": 4.066130473637176, "grad_norm": 0.24151015281677246, "learning_rate": 3.699807355807432e-05, "loss": 0.7778, "num_input_tokens_seen": 15748240, "step": 27300 }, { "epoch": 4.066875186178136, "grad_norm": 0.23779022693634033, "learning_rate": 3.6992371648676685e-05, "loss": 0.761, "num_input_tokens_seen": 15751088, "step": 27305 }, { "epoch": 4.067619898719094, "grad_norm": 0.23511503636837006, "learning_rate": 3.698666892888332e-05, "loss": 0.8334, "num_input_tokens_seen": 15754192, "step": 27310 }, { "epoch": 4.068364611260054, "grad_norm": 0.21809570491313934, "learning_rate": 3.698096539907962e-05, "loss": 0.8082, "num_input_tokens_seen": 15756976, "step": 27315 }, { "epoch": 4.069109323801013, "grad_norm": 0.25879210233688354, "learning_rate": 3.6975261059650986e-05, "loss": 0.8299, "num_input_tokens_seen": 15760144, "step": 27320 }, { "epoch": 4.069854036341972, "grad_norm": 0.28649666905403137, "learning_rate": 3.696955591098289e-05, "loss": 0.8379, "num_input_tokens_seen": 15763056, "step": 27325 }, { "epoch": 4.070598748882931, "grad_norm": 0.19484426081180573, "learning_rate": 3.696384995346087e-05, "loss": 0.7579, "num_input_tokens_seen": 15765680, "step": 27330 }, { "epoch": 4.071343461423891, "grad_norm": 0.2332654893398285, "learning_rate": 3.6958143187470514e-05, "loss": 0.8654, "num_input_tokens_seen": 15768368, "step": 27335 }, { "epoch": 4.07208817396485, "grad_norm": 0.1613553911447525, "learning_rate": 3.695243561339747e-05, "loss": 0.7489, "num_input_tokens_seen": 15771472, "step": 27340 }, { "epoch": 4.072832886505808, "grad_norm": 0.305787056684494, "learning_rate": 3.694672723162741e-05, "loss": 0.82, "num_input_tokens_seen": 15774448, "step": 27345 }, { "epoch": 4.073577599046768, "grad_norm": 0.17094452679157257, "learning_rate": 3.69410180425461e-05, "loss": 0.8089, "num_input_tokens_seen": 15777392, "step": 27350 }, { "epoch": 4.074322311587727, "grad_norm": 0.18656785786151886, "learning_rate": 3.693530804653934e-05, "loss": 0.7961, "num_input_tokens_seen": 15780080, "step": 27355 }, { "epoch": 4.075067024128686, "grad_norm": 0.1933479756116867, "learning_rate": 3.692959724399299e-05, "loss": 0.8138, "num_input_tokens_seen": 15782992, "step": 27360 }, { "epoch": 4.075811736669645, "grad_norm": 0.30328524112701416, "learning_rate": 3.692388563529295e-05, "loss": 0.8247, "num_input_tokens_seen": 15786032, "step": 27365 }, { "epoch": 4.076556449210605, "grad_norm": 0.2098522186279297, "learning_rate": 3.6918173220825204e-05, "loss": 0.7929, "num_input_tokens_seen": 15788976, "step": 27370 }, { "epoch": 4.0773011617515635, "grad_norm": 0.24899989366531372, "learning_rate": 3.691246000097577e-05, "loss": 0.8085, "num_input_tokens_seen": 15791696, "step": 27375 }, { "epoch": 4.078045874292523, "grad_norm": 0.25636327266693115, "learning_rate": 3.6906745976130716e-05, "loss": 0.7974, "num_input_tokens_seen": 15794480, "step": 27380 }, { "epoch": 4.078790586833482, "grad_norm": 0.2436840534210205, "learning_rate": 3.6901031146676185e-05, "loss": 0.8052, "num_input_tokens_seen": 15797136, "step": 27385 }, { "epoch": 4.079535299374442, "grad_norm": 0.25846701860427856, "learning_rate": 3.689531551299835e-05, "loss": 0.8026, "num_input_tokens_seen": 15799696, "step": 27390 }, { "epoch": 4.0802800119154, "grad_norm": 0.20986701548099518, "learning_rate": 3.688959907548346e-05, "loss": 0.769, "num_input_tokens_seen": 15802832, "step": 27395 }, { "epoch": 4.08102472445636, "grad_norm": 0.1878247857093811, "learning_rate": 3.68838818345178e-05, "loss": 0.8016, "num_input_tokens_seen": 15805456, "step": 27400 }, { "epoch": 4.081769436997319, "grad_norm": 0.2294740378856659, "learning_rate": 3.6878163790487726e-05, "loss": 0.8054, "num_input_tokens_seen": 15808464, "step": 27405 }, { "epoch": 4.082514149538278, "grad_norm": 0.2236141562461853, "learning_rate": 3.6872444943779624e-05, "loss": 0.7891, "num_input_tokens_seen": 15811184, "step": 27410 }, { "epoch": 4.083258862079237, "grad_norm": 0.2080782651901245, "learning_rate": 3.686672529477998e-05, "loss": 0.8044, "num_input_tokens_seen": 15814096, "step": 27415 }, { "epoch": 4.084003574620197, "grad_norm": 0.3119044899940491, "learning_rate": 3.686100484387528e-05, "loss": 0.7808, "num_input_tokens_seen": 15817232, "step": 27420 }, { "epoch": 4.084748287161156, "grad_norm": 0.20810814201831818, "learning_rate": 3.685528359145209e-05, "loss": 0.7828, "num_input_tokens_seen": 15820208, "step": 27425 }, { "epoch": 4.085492999702115, "grad_norm": 0.1845817118883133, "learning_rate": 3.6849561537897045e-05, "loss": 0.8069, "num_input_tokens_seen": 15822864, "step": 27430 }, { "epoch": 4.086237712243074, "grad_norm": 0.21529026329517365, "learning_rate": 3.684383868359681e-05, "loss": 0.817, "num_input_tokens_seen": 15826256, "step": 27435 }, { "epoch": 4.086982424784034, "grad_norm": 0.1972876638174057, "learning_rate": 3.68381150289381e-05, "loss": 0.8096, "num_input_tokens_seen": 15828880, "step": 27440 }, { "epoch": 4.087727137324992, "grad_norm": 0.21905210614204407, "learning_rate": 3.683239057430771e-05, "loss": 0.7784, "num_input_tokens_seen": 15831760, "step": 27445 }, { "epoch": 4.088471849865952, "grad_norm": 0.19630815088748932, "learning_rate": 3.6826665320092465e-05, "loss": 0.7943, "num_input_tokens_seen": 15834992, "step": 27450 }, { "epoch": 4.089216562406911, "grad_norm": 0.21002978086471558, "learning_rate": 3.682093926667927e-05, "loss": 0.8234, "num_input_tokens_seen": 15837808, "step": 27455 }, { "epoch": 4.08996127494787, "grad_norm": 0.18844833970069885, "learning_rate": 3.681521241445506e-05, "loss": 0.7965, "num_input_tokens_seen": 15840528, "step": 27460 }, { "epoch": 4.090705987488829, "grad_norm": 0.2532957196235657, "learning_rate": 3.6809484763806834e-05, "loss": 0.8087, "num_input_tokens_seen": 15843120, "step": 27465 }, { "epoch": 4.091450700029789, "grad_norm": 0.22159330546855927, "learning_rate": 3.680375631512164e-05, "loss": 0.8092, "num_input_tokens_seen": 15845840, "step": 27470 }, { "epoch": 4.092195412570748, "grad_norm": 0.36682483553886414, "learning_rate": 3.679802706878658e-05, "loss": 0.8625, "num_input_tokens_seen": 15848720, "step": 27475 }, { "epoch": 4.092940125111707, "grad_norm": 0.19844931364059448, "learning_rate": 3.6792297025188824e-05, "loss": 0.8256, "num_input_tokens_seen": 15851504, "step": 27480 }, { "epoch": 4.093684837652666, "grad_norm": 0.21460948884487152, "learning_rate": 3.6786566184715576e-05, "loss": 0.7965, "num_input_tokens_seen": 15854256, "step": 27485 }, { "epoch": 4.094429550193626, "grad_norm": 0.2302243709564209, "learning_rate": 3.67808345477541e-05, "loss": 0.8032, "num_input_tokens_seen": 15857168, "step": 27490 }, { "epoch": 4.095174262734584, "grad_norm": 0.2605369985103607, "learning_rate": 3.6775102114691736e-05, "loss": 0.7919, "num_input_tokens_seen": 15859984, "step": 27495 }, { "epoch": 4.095918975275544, "grad_norm": 0.22308023273944855, "learning_rate": 3.676936888591583e-05, "loss": 0.794, "num_input_tokens_seen": 15862800, "step": 27500 }, { "epoch": 4.096663687816503, "grad_norm": 0.30022454261779785, "learning_rate": 3.6763634861813836e-05, "loss": 0.8097, "num_input_tokens_seen": 15865584, "step": 27505 }, { "epoch": 4.0974084003574625, "grad_norm": 0.20341405272483826, "learning_rate": 3.675790004277322e-05, "loss": 0.8118, "num_input_tokens_seen": 15868560, "step": 27510 }, { "epoch": 4.098153112898421, "grad_norm": 0.2780657112598419, "learning_rate": 3.675216442918153e-05, "loss": 0.7863, "num_input_tokens_seen": 15871280, "step": 27515 }, { "epoch": 4.09889782543938, "grad_norm": 0.20635204017162323, "learning_rate": 3.674642802142635e-05, "loss": 0.8116, "num_input_tokens_seen": 15874448, "step": 27520 }, { "epoch": 4.09964253798034, "grad_norm": 0.2730596661567688, "learning_rate": 3.6740690819895304e-05, "loss": 0.7878, "num_input_tokens_seen": 15877456, "step": 27525 }, { "epoch": 4.100387250521298, "grad_norm": 0.23662874102592468, "learning_rate": 3.673495282497613e-05, "loss": 0.7705, "num_input_tokens_seen": 15880464, "step": 27530 }, { "epoch": 4.101131963062258, "grad_norm": 0.2008260190486908, "learning_rate": 3.672921403705654e-05, "loss": 0.8043, "num_input_tokens_seen": 15883408, "step": 27535 }, { "epoch": 4.101876675603217, "grad_norm": 0.2458835244178772, "learning_rate": 3.672347445652436e-05, "loss": 0.8342, "num_input_tokens_seen": 15886416, "step": 27540 }, { "epoch": 4.102621388144176, "grad_norm": 0.22893132269382477, "learning_rate": 3.671773408376743e-05, "loss": 0.7923, "num_input_tokens_seen": 15889008, "step": 27545 }, { "epoch": 4.103366100685135, "grad_norm": 0.25231194496154785, "learning_rate": 3.671199291917368e-05, "loss": 0.82, "num_input_tokens_seen": 15891824, "step": 27550 }, { "epoch": 4.104110813226095, "grad_norm": 0.2003728300333023, "learning_rate": 3.6706250963131065e-05, "loss": 0.7683, "num_input_tokens_seen": 15894416, "step": 27555 }, { "epoch": 4.104855525767054, "grad_norm": 0.3228144943714142, "learning_rate": 3.670050821602761e-05, "loss": 0.8363, "num_input_tokens_seen": 15897232, "step": 27560 }, { "epoch": 4.105600238308013, "grad_norm": 0.2192641794681549, "learning_rate": 3.669476467825137e-05, "loss": 0.7926, "num_input_tokens_seen": 15900240, "step": 27565 }, { "epoch": 4.106344950848972, "grad_norm": 0.2956985831260681, "learning_rate": 3.668902035019049e-05, "loss": 0.7958, "num_input_tokens_seen": 15903600, "step": 27570 }, { "epoch": 4.107089663389932, "grad_norm": 0.18314461410045624, "learning_rate": 3.668327523223313e-05, "loss": 0.7806, "num_input_tokens_seen": 15906448, "step": 27575 }, { "epoch": 4.10783437593089, "grad_norm": 0.17600186169147491, "learning_rate": 3.667752932476753e-05, "loss": 0.7918, "num_input_tokens_seen": 15909040, "step": 27580 }, { "epoch": 4.10857908847185, "grad_norm": 0.24359171092510223, "learning_rate": 3.667178262818198e-05, "loss": 0.8714, "num_input_tokens_seen": 15912080, "step": 27585 }, { "epoch": 4.109323801012809, "grad_norm": 0.24441155791282654, "learning_rate": 3.666603514286482e-05, "loss": 0.8098, "num_input_tokens_seen": 15915024, "step": 27590 }, { "epoch": 4.1100685135537685, "grad_norm": 0.23811127245426178, "learning_rate": 3.666028686920443e-05, "loss": 0.7871, "num_input_tokens_seen": 15917776, "step": 27595 }, { "epoch": 4.110813226094727, "grad_norm": 0.27860525250434875, "learning_rate": 3.665453780758926e-05, "loss": 0.8409, "num_input_tokens_seen": 15920624, "step": 27600 }, { "epoch": 4.111557938635687, "grad_norm": 0.4225751459598541, "learning_rate": 3.6648787958407803e-05, "loss": 0.8038, "num_input_tokens_seen": 15923536, "step": 27605 }, { "epoch": 4.112302651176646, "grad_norm": 0.1921469271183014, "learning_rate": 3.6643037322048624e-05, "loss": 0.7901, "num_input_tokens_seen": 15926672, "step": 27610 }, { "epoch": 4.113047363717605, "grad_norm": 0.2762244641780853, "learning_rate": 3.663728589890032e-05, "loss": 0.8082, "num_input_tokens_seen": 15929936, "step": 27615 }, { "epoch": 4.113792076258564, "grad_norm": 0.2338923066854477, "learning_rate": 3.6631533689351544e-05, "loss": 0.7779, "num_input_tokens_seen": 15932656, "step": 27620 }, { "epoch": 4.114536788799524, "grad_norm": 0.2606368660926819, "learning_rate": 3.6625780693791016e-05, "loss": 0.7945, "num_input_tokens_seen": 15935568, "step": 27625 }, { "epoch": 4.115281501340482, "grad_norm": 0.26785340905189514, "learning_rate": 3.6620026912607497e-05, "loss": 0.8007, "num_input_tokens_seen": 15938288, "step": 27630 }, { "epoch": 4.116026213881442, "grad_norm": 0.2564369738101959, "learning_rate": 3.6614272346189795e-05, "loss": 0.8186, "num_input_tokens_seen": 15941136, "step": 27635 }, { "epoch": 4.116770926422401, "grad_norm": 0.2544606029987335, "learning_rate": 3.660851699492679e-05, "loss": 0.7811, "num_input_tokens_seen": 15943920, "step": 27640 }, { "epoch": 4.1175156389633605, "grad_norm": 0.2497386783361435, "learning_rate": 3.660276085920742e-05, "loss": 0.7956, "num_input_tokens_seen": 15946736, "step": 27645 }, { "epoch": 4.118260351504319, "grad_norm": 0.20489220321178436, "learning_rate": 3.6597003939420623e-05, "loss": 0.7903, "num_input_tokens_seen": 15949616, "step": 27650 }, { "epoch": 4.119005064045279, "grad_norm": 0.2902090847492218, "learning_rate": 3.6591246235955456e-05, "loss": 0.8057, "num_input_tokens_seen": 15952272, "step": 27655 }, { "epoch": 4.119749776586238, "grad_norm": 0.24897457659244537, "learning_rate": 3.6585487749200996e-05, "loss": 0.8098, "num_input_tokens_seen": 15955408, "step": 27660 }, { "epoch": 4.120494489127197, "grad_norm": 0.19310347735881805, "learning_rate": 3.657972847954638e-05, "loss": 0.7951, "num_input_tokens_seen": 15958384, "step": 27665 }, { "epoch": 4.121239201668156, "grad_norm": 0.2849579453468323, "learning_rate": 3.657396842738079e-05, "loss": 0.8013, "num_input_tokens_seen": 15961232, "step": 27670 }, { "epoch": 4.121983914209116, "grad_norm": 0.1768151968717575, "learning_rate": 3.6568207593093465e-05, "loss": 0.8334, "num_input_tokens_seen": 15963984, "step": 27675 }, { "epoch": 4.1227286267500745, "grad_norm": 0.23173999786376953, "learning_rate": 3.656244597707372e-05, "loss": 0.8106, "num_input_tokens_seen": 15966800, "step": 27680 }, { "epoch": 4.123473339291033, "grad_norm": 0.1810513734817505, "learning_rate": 3.655668357971087e-05, "loss": 0.7981, "num_input_tokens_seen": 15969392, "step": 27685 }, { "epoch": 4.124218051831993, "grad_norm": 0.15128697454929352, "learning_rate": 3.6550920401394335e-05, "loss": 0.7685, "num_input_tokens_seen": 15972304, "step": 27690 }, { "epoch": 4.124962764372952, "grad_norm": 0.24066360294818878, "learning_rate": 3.654515644251356e-05, "loss": 0.8183, "num_input_tokens_seen": 15975056, "step": 27695 }, { "epoch": 4.125707476913911, "grad_norm": 0.263964980840683, "learning_rate": 3.653939170345805e-05, "loss": 0.7856, "num_input_tokens_seen": 15977712, "step": 27700 }, { "epoch": 4.12645218945487, "grad_norm": 0.242279514670372, "learning_rate": 3.653362618461737e-05, "loss": 0.8445, "num_input_tokens_seen": 15980432, "step": 27705 }, { "epoch": 4.12719690199583, "grad_norm": 0.23411040008068085, "learning_rate": 3.652785988638112e-05, "loss": 0.798, "num_input_tokens_seen": 15983152, "step": 27710 }, { "epoch": 4.127941614536788, "grad_norm": 0.24814127385616302, "learning_rate": 3.6522092809138975e-05, "loss": 0.7823, "num_input_tokens_seen": 15985744, "step": 27715 }, { "epoch": 4.128686327077748, "grad_norm": 0.2935878336429596, "learning_rate": 3.651632495328064e-05, "loss": 0.7911, "num_input_tokens_seen": 15988464, "step": 27720 }, { "epoch": 4.129431039618707, "grad_norm": 0.2300173044204712, "learning_rate": 3.6510556319195884e-05, "loss": 0.8326, "num_input_tokens_seen": 15991344, "step": 27725 }, { "epoch": 4.1301757521596665, "grad_norm": 0.2225097417831421, "learning_rate": 3.650478690727454e-05, "loss": 0.8006, "num_input_tokens_seen": 15993968, "step": 27730 }, { "epoch": 4.130920464700625, "grad_norm": 0.17746715247631073, "learning_rate": 3.6499016717906455e-05, "loss": 0.7894, "num_input_tokens_seen": 15996848, "step": 27735 }, { "epoch": 4.131665177241585, "grad_norm": 0.3035048842430115, "learning_rate": 3.6493245751481574e-05, "loss": 0.8327, "num_input_tokens_seen": 15999856, "step": 27740 }, { "epoch": 4.132409889782544, "grad_norm": 0.1958421915769577, "learning_rate": 3.648747400838989e-05, "loss": 0.8075, "num_input_tokens_seen": 16002768, "step": 27745 }, { "epoch": 4.133154602323503, "grad_norm": 0.2567218840122223, "learning_rate": 3.6481701489021404e-05, "loss": 0.8044, "num_input_tokens_seen": 16005552, "step": 27750 }, { "epoch": 4.133899314864462, "grad_norm": 0.17689555883407593, "learning_rate": 3.647592819376621e-05, "loss": 0.7756, "num_input_tokens_seen": 16008464, "step": 27755 }, { "epoch": 4.134644027405422, "grad_norm": 0.3108327388763428, "learning_rate": 3.6470154123014455e-05, "loss": 0.7918, "num_input_tokens_seen": 16011184, "step": 27760 }, { "epoch": 4.1353887399463805, "grad_norm": 0.24067582190036774, "learning_rate": 3.646437927715632e-05, "loss": 0.7795, "num_input_tokens_seen": 16013872, "step": 27765 }, { "epoch": 4.13613345248734, "grad_norm": 0.20164158940315247, "learning_rate": 3.645860365658203e-05, "loss": 0.8461, "num_input_tokens_seen": 16016560, "step": 27770 }, { "epoch": 4.136878165028299, "grad_norm": 0.25698867440223694, "learning_rate": 3.645282726168191e-05, "loss": 0.7883, "num_input_tokens_seen": 16019568, "step": 27775 }, { "epoch": 4.1376228775692585, "grad_norm": 0.30784210562705994, "learning_rate": 3.644705009284628e-05, "loss": 0.8104, "num_input_tokens_seen": 16022576, "step": 27780 }, { "epoch": 4.138367590110217, "grad_norm": 0.2689281702041626, "learning_rate": 3.644127215046555e-05, "loss": 0.7819, "num_input_tokens_seen": 16025360, "step": 27785 }, { "epoch": 4.139112302651177, "grad_norm": 0.2600044012069702, "learning_rate": 3.643549343493015e-05, "loss": 0.8124, "num_input_tokens_seen": 16028240, "step": 27790 }, { "epoch": 4.139857015192136, "grad_norm": 0.2633821666240692, "learning_rate": 3.642971394663061e-05, "loss": 0.7921, "num_input_tokens_seen": 16031408, "step": 27795 }, { "epoch": 4.140601727733095, "grad_norm": 0.34064731001853943, "learning_rate": 3.642393368595747e-05, "loss": 0.8428, "num_input_tokens_seen": 16034192, "step": 27800 }, { "epoch": 4.141346440274054, "grad_norm": 0.29242730140686035, "learning_rate": 3.641815265330133e-05, "loss": 0.821, "num_input_tokens_seen": 16036880, "step": 27805 }, { "epoch": 4.142091152815014, "grad_norm": 0.315560907125473, "learning_rate": 3.6412370849052865e-05, "loss": 0.7881, "num_input_tokens_seen": 16039728, "step": 27810 }, { "epoch": 4.1428358653559725, "grad_norm": 0.22451384365558624, "learning_rate": 3.6406588273602774e-05, "loss": 0.7955, "num_input_tokens_seen": 16042512, "step": 27815 }, { "epoch": 4.143580577896932, "grad_norm": 0.30689895153045654, "learning_rate": 3.640080492734182e-05, "loss": 0.7989, "num_input_tokens_seen": 16045424, "step": 27820 }, { "epoch": 4.144325290437891, "grad_norm": 0.2249312847852707, "learning_rate": 3.639502081066083e-05, "loss": 0.7976, "num_input_tokens_seen": 16048112, "step": 27825 }, { "epoch": 4.1450700029788505, "grad_norm": 0.2149430364370346, "learning_rate": 3.638923592395066e-05, "loss": 0.8294, "num_input_tokens_seen": 16050960, "step": 27830 }, { "epoch": 4.145814715519809, "grad_norm": 0.22774973511695862, "learning_rate": 3.638345026760222e-05, "loss": 0.7977, "num_input_tokens_seen": 16053936, "step": 27835 }, { "epoch": 4.146559428060769, "grad_norm": 0.162056565284729, "learning_rate": 3.63776638420065e-05, "loss": 0.8165, "num_input_tokens_seen": 16056528, "step": 27840 }, { "epoch": 4.147304140601728, "grad_norm": 0.297650009393692, "learning_rate": 3.6371876647554524e-05, "loss": 0.7721, "num_input_tokens_seen": 16059408, "step": 27845 }, { "epoch": 4.148048853142687, "grad_norm": 0.18111170828342438, "learning_rate": 3.636608868463735e-05, "loss": 0.7866, "num_input_tokens_seen": 16062288, "step": 27850 }, { "epoch": 4.148793565683646, "grad_norm": 0.2405393272638321, "learning_rate": 3.636029995364611e-05, "loss": 0.7854, "num_input_tokens_seen": 16065200, "step": 27855 }, { "epoch": 4.149538278224606, "grad_norm": 0.2753698527812958, "learning_rate": 3.6354510454972e-05, "loss": 0.7895, "num_input_tokens_seen": 16068240, "step": 27860 }, { "epoch": 4.1502829907655645, "grad_norm": 0.22256696224212646, "learning_rate": 3.634872018900623e-05, "loss": 0.8067, "num_input_tokens_seen": 16071024, "step": 27865 }, { "epoch": 4.151027703306523, "grad_norm": 0.24224787950515747, "learning_rate": 3.634292915614009e-05, "loss": 0.8307, "num_input_tokens_seen": 16073936, "step": 27870 }, { "epoch": 4.151772415847483, "grad_norm": 0.1880723088979721, "learning_rate": 3.633713735676491e-05, "loss": 0.8057, "num_input_tokens_seen": 16076816, "step": 27875 }, { "epoch": 4.152517128388442, "grad_norm": 0.29975655674934387, "learning_rate": 3.6331344791272087e-05, "loss": 0.8212, "num_input_tokens_seen": 16079696, "step": 27880 }, { "epoch": 4.153261840929401, "grad_norm": 0.19746069610118866, "learning_rate": 3.632555146005305e-05, "loss": 0.8239, "num_input_tokens_seen": 16082928, "step": 27885 }, { "epoch": 4.15400655347036, "grad_norm": 0.20912706851959229, "learning_rate": 3.63197573634993e-05, "loss": 0.7915, "num_input_tokens_seen": 16085936, "step": 27890 }, { "epoch": 4.15475126601132, "grad_norm": 0.2814314365386963, "learning_rate": 3.6313962502002365e-05, "loss": 0.8009, "num_input_tokens_seen": 16088944, "step": 27895 }, { "epoch": 4.1554959785522785, "grad_norm": 0.33896249532699585, "learning_rate": 3.6308166875953836e-05, "loss": 0.813, "num_input_tokens_seen": 16091984, "step": 27900 }, { "epoch": 4.156240691093238, "grad_norm": 0.2205326110124588, "learning_rate": 3.630237048574537e-05, "loss": 0.7611, "num_input_tokens_seen": 16095056, "step": 27905 }, { "epoch": 4.156985403634197, "grad_norm": 0.20330283045768738, "learning_rate": 3.6296573331768664e-05, "loss": 0.8124, "num_input_tokens_seen": 16097840, "step": 27910 }, { "epoch": 4.1577301161751565, "grad_norm": 0.2597186863422394, "learning_rate": 3.629077541441546e-05, "loss": 0.791, "num_input_tokens_seen": 16100528, "step": 27915 }, { "epoch": 4.158474828716115, "grad_norm": 0.3014645576477051, "learning_rate": 3.628497673407755e-05, "loss": 0.7936, "num_input_tokens_seen": 16103344, "step": 27920 }, { "epoch": 4.159219541257075, "grad_norm": 0.2070109248161316, "learning_rate": 3.62791772911468e-05, "loss": 0.8085, "num_input_tokens_seen": 16106064, "step": 27925 }, { "epoch": 4.159964253798034, "grad_norm": 0.26446837186813354, "learning_rate": 3.6273377086015106e-05, "loss": 0.7745, "num_input_tokens_seen": 16108752, "step": 27930 }, { "epoch": 4.160708966338993, "grad_norm": 0.2939153015613556, "learning_rate": 3.626757611907442e-05, "loss": 0.8353, "num_input_tokens_seen": 16111408, "step": 27935 }, { "epoch": 4.161453678879952, "grad_norm": 0.345792293548584, "learning_rate": 3.6261774390716744e-05, "loss": 0.785, "num_input_tokens_seen": 16114384, "step": 27940 }, { "epoch": 4.162198391420912, "grad_norm": 0.21092697978019714, "learning_rate": 3.625597190133416e-05, "loss": 0.7893, "num_input_tokens_seen": 16117328, "step": 27945 }, { "epoch": 4.1629431039618705, "grad_norm": 0.23475421965122223, "learning_rate": 3.625016865131875e-05, "loss": 0.7987, "num_input_tokens_seen": 16120336, "step": 27950 }, { "epoch": 4.16368781650283, "grad_norm": 0.2985266447067261, "learning_rate": 3.624436464106267e-05, "loss": 0.7871, "num_input_tokens_seen": 16123120, "step": 27955 }, { "epoch": 4.164432529043789, "grad_norm": 0.3158474564552307, "learning_rate": 3.623855987095816e-05, "loss": 0.7777, "num_input_tokens_seen": 16126000, "step": 27960 }, { "epoch": 4.165177241584749, "grad_norm": 0.3122048079967499, "learning_rate": 3.623275434139746e-05, "loss": 0.7948, "num_input_tokens_seen": 16129072, "step": 27965 }, { "epoch": 4.165921954125707, "grad_norm": 0.22139635682106018, "learning_rate": 3.622694805277289e-05, "loss": 0.7856, "num_input_tokens_seen": 16131632, "step": 27970 }, { "epoch": 4.166666666666667, "grad_norm": 0.2668023109436035, "learning_rate": 3.6221141005476824e-05, "loss": 0.7731, "num_input_tokens_seen": 16134768, "step": 27975 }, { "epoch": 4.167411379207626, "grad_norm": 0.26012125611305237, "learning_rate": 3.6215333199901655e-05, "loss": 0.7975, "num_input_tokens_seen": 16137712, "step": 27980 }, { "epoch": 4.168156091748585, "grad_norm": 0.20189721882343292, "learning_rate": 3.620952463643989e-05, "loss": 0.8073, "num_input_tokens_seen": 16140560, "step": 27985 }, { "epoch": 4.168900804289544, "grad_norm": 0.25014352798461914, "learning_rate": 3.6203715315484e-05, "loss": 0.7839, "num_input_tokens_seen": 16143856, "step": 27990 }, { "epoch": 4.169645516830504, "grad_norm": 0.20223042368888855, "learning_rate": 3.6197905237426596e-05, "loss": 0.7646, "num_input_tokens_seen": 16146640, "step": 27995 }, { "epoch": 4.1703902293714625, "grad_norm": 0.22550341486930847, "learning_rate": 3.619209440266027e-05, "loss": 0.8255, "num_input_tokens_seen": 16149712, "step": 28000 }, { "epoch": 4.171134941912422, "grad_norm": 0.2537159025669098, "learning_rate": 3.618628281157772e-05, "loss": 0.8086, "num_input_tokens_seen": 16152400, "step": 28005 }, { "epoch": 4.171879654453381, "grad_norm": 0.28045445680618286, "learning_rate": 3.618047046457166e-05, "loss": 0.7868, "num_input_tokens_seen": 16155536, "step": 28010 }, { "epoch": 4.172624366994341, "grad_norm": 0.17147938907146454, "learning_rate": 3.617465736203485e-05, "loss": 0.8055, "num_input_tokens_seen": 16158480, "step": 28015 }, { "epoch": 4.173369079535299, "grad_norm": 0.23284491896629333, "learning_rate": 3.616884350436013e-05, "loss": 0.7801, "num_input_tokens_seen": 16161296, "step": 28020 }, { "epoch": 4.174113792076259, "grad_norm": 0.16717863082885742, "learning_rate": 3.616302889194039e-05, "loss": 0.8219, "num_input_tokens_seen": 16164400, "step": 28025 }, { "epoch": 4.174858504617218, "grad_norm": 0.21385608613491058, "learning_rate": 3.6157213525168534e-05, "loss": 0.7785, "num_input_tokens_seen": 16167408, "step": 28030 }, { "epoch": 4.1756032171581765, "grad_norm": 0.24160636961460114, "learning_rate": 3.6151397404437544e-05, "loss": 0.7956, "num_input_tokens_seen": 16170032, "step": 28035 }, { "epoch": 4.176347929699136, "grad_norm": 0.23224186897277832, "learning_rate": 3.614558053014045e-05, "loss": 0.836, "num_input_tokens_seen": 16172624, "step": 28040 }, { "epoch": 4.177092642240095, "grad_norm": 0.255581259727478, "learning_rate": 3.613976290267036e-05, "loss": 0.876, "num_input_tokens_seen": 16175184, "step": 28045 }, { "epoch": 4.177837354781055, "grad_norm": 0.19997218251228333, "learning_rate": 3.6133944522420374e-05, "loss": 0.8039, "num_input_tokens_seen": 16178224, "step": 28050 }, { "epoch": 4.178582067322013, "grad_norm": 0.3260820806026459, "learning_rate": 3.612812538978368e-05, "loss": 0.7673, "num_input_tokens_seen": 16181456, "step": 28055 }, { "epoch": 4.179326779862973, "grad_norm": 0.18079884350299835, "learning_rate": 3.612230550515352e-05, "loss": 0.8394, "num_input_tokens_seen": 16184112, "step": 28060 }, { "epoch": 4.180071492403932, "grad_norm": 0.31037312746047974, "learning_rate": 3.6116484868923174e-05, "loss": 0.8058, "num_input_tokens_seen": 16187152, "step": 28065 }, { "epoch": 4.180816204944891, "grad_norm": 0.33222806453704834, "learning_rate": 3.611066348148597e-05, "loss": 0.7804, "num_input_tokens_seen": 16190288, "step": 28070 }, { "epoch": 4.18156091748585, "grad_norm": 0.24568666517734528, "learning_rate": 3.6104841343235313e-05, "loss": 0.8349, "num_input_tokens_seen": 16193008, "step": 28075 }, { "epoch": 4.18230563002681, "grad_norm": 0.2298187017440796, "learning_rate": 3.609901845456462e-05, "loss": 0.7727, "num_input_tokens_seen": 16196112, "step": 28080 }, { "epoch": 4.1830503425677685, "grad_norm": 0.2306043803691864, "learning_rate": 3.6093194815867385e-05, "loss": 0.783, "num_input_tokens_seen": 16199344, "step": 28085 }, { "epoch": 4.183795055108728, "grad_norm": 0.2205173373222351, "learning_rate": 3.608737042753715e-05, "loss": 0.7993, "num_input_tokens_seen": 16202320, "step": 28090 }, { "epoch": 4.184539767649687, "grad_norm": 0.252575546503067, "learning_rate": 3.608154528996749e-05, "loss": 0.8161, "num_input_tokens_seen": 16205488, "step": 28095 }, { "epoch": 4.185284480190647, "grad_norm": 0.16110378503799438, "learning_rate": 3.607571940355206e-05, "loss": 0.8191, "num_input_tokens_seen": 16208080, "step": 28100 }, { "epoch": 4.186029192731605, "grad_norm": 0.25818130373954773, "learning_rate": 3.606989276868455e-05, "loss": 0.8107, "num_input_tokens_seen": 16210864, "step": 28105 }, { "epoch": 4.186773905272565, "grad_norm": 0.20274591445922852, "learning_rate": 3.606406538575868e-05, "loss": 0.7884, "num_input_tokens_seen": 16213712, "step": 28110 }, { "epoch": 4.187518617813524, "grad_norm": 0.21238788962364197, "learning_rate": 3.605823725516826e-05, "loss": 0.8147, "num_input_tokens_seen": 16216336, "step": 28115 }, { "epoch": 4.188263330354483, "grad_norm": 0.2921294867992401, "learning_rate": 3.605240837730713e-05, "loss": 0.8216, "num_input_tokens_seen": 16219376, "step": 28120 }, { "epoch": 4.189008042895442, "grad_norm": 0.2525225579738617, "learning_rate": 3.604657875256918e-05, "loss": 0.8423, "num_input_tokens_seen": 16222064, "step": 28125 }, { "epoch": 4.189752755436402, "grad_norm": 0.210801899433136, "learning_rate": 3.604074838134834e-05, "loss": 0.8149, "num_input_tokens_seen": 16224848, "step": 28130 }, { "epoch": 4.190497467977361, "grad_norm": 0.2099798619747162, "learning_rate": 3.603491726403862e-05, "loss": 0.7627, "num_input_tokens_seen": 16227792, "step": 28135 }, { "epoch": 4.19124218051832, "grad_norm": 0.28416168689727783, "learning_rate": 3.6029085401034053e-05, "loss": 0.8025, "num_input_tokens_seen": 16230576, "step": 28140 }, { "epoch": 4.191986893059279, "grad_norm": 0.1964341402053833, "learning_rate": 3.602325279272874e-05, "loss": 0.7977, "num_input_tokens_seen": 16233104, "step": 28145 }, { "epoch": 4.192731605600239, "grad_norm": 0.22632791101932526, "learning_rate": 3.6017419439516815e-05, "loss": 0.8229, "num_input_tokens_seen": 16235760, "step": 28150 }, { "epoch": 4.193476318141197, "grad_norm": 0.19747211039066315, "learning_rate": 3.6011585341792477e-05, "loss": 0.8084, "num_input_tokens_seen": 16238512, "step": 28155 }, { "epoch": 4.194221030682157, "grad_norm": 0.20912562310695648, "learning_rate": 3.600575049994997e-05, "loss": 0.8158, "num_input_tokens_seen": 16241360, "step": 28160 }, { "epoch": 4.194965743223116, "grad_norm": 0.2392590045928955, "learning_rate": 3.59999149143836e-05, "loss": 0.8242, "num_input_tokens_seen": 16244496, "step": 28165 }, { "epoch": 4.195710455764075, "grad_norm": 0.3203585147857666, "learning_rate": 3.5994078585487694e-05, "loss": 0.8043, "num_input_tokens_seen": 16247152, "step": 28170 }, { "epoch": 4.196455168305034, "grad_norm": 0.3275793492794037, "learning_rate": 3.5988241513656664e-05, "loss": 0.7823, "num_input_tokens_seen": 16250064, "step": 28175 }, { "epoch": 4.197199880845994, "grad_norm": 0.24217800796031952, "learning_rate": 3.598240369928494e-05, "loss": 0.7909, "num_input_tokens_seen": 16252880, "step": 28180 }, { "epoch": 4.197944593386953, "grad_norm": 0.226661816239357, "learning_rate": 3.5976565142767025e-05, "loss": 0.8132, "num_input_tokens_seen": 16255568, "step": 28185 }, { "epoch": 4.198689305927912, "grad_norm": 0.1912386417388916, "learning_rate": 3.5970725844497465e-05, "loss": 0.8089, "num_input_tokens_seen": 16258576, "step": 28190 }, { "epoch": 4.199434018468871, "grad_norm": 0.1932682991027832, "learning_rate": 3.596488580487086e-05, "loss": 0.8227, "num_input_tokens_seen": 16261584, "step": 28195 }, { "epoch": 4.200178731009831, "grad_norm": 0.25349903106689453, "learning_rate": 3.595904502428185e-05, "loss": 0.8193, "num_input_tokens_seen": 16264240, "step": 28200 }, { "epoch": 4.200923443550789, "grad_norm": 0.2802221179008484, "learning_rate": 3.595320350312513e-05, "loss": 0.8134, "num_input_tokens_seen": 16266864, "step": 28205 }, { "epoch": 4.201668156091749, "grad_norm": 0.3005432188510895, "learning_rate": 3.594736124179546e-05, "loss": 0.7904, "num_input_tokens_seen": 16269840, "step": 28210 }, { "epoch": 4.202412868632708, "grad_norm": 0.24861834943294525, "learning_rate": 3.594151824068762e-05, "loss": 0.785, "num_input_tokens_seen": 16272944, "step": 28215 }, { "epoch": 4.203157581173667, "grad_norm": 0.33399146795272827, "learning_rate": 3.593567450019646e-05, "loss": 0.8299, "num_input_tokens_seen": 16275568, "step": 28220 }, { "epoch": 4.203902293714626, "grad_norm": 0.2823270857334137, "learning_rate": 3.592983002071688e-05, "loss": 0.8161, "num_input_tokens_seen": 16278832, "step": 28225 }, { "epoch": 4.204647006255585, "grad_norm": 0.2408331036567688, "learning_rate": 3.5923984802643826e-05, "loss": 0.7953, "num_input_tokens_seen": 16281648, "step": 28230 }, { "epoch": 4.205391718796545, "grad_norm": 0.27036333084106445, "learning_rate": 3.59181388463723e-05, "loss": 0.7757, "num_input_tokens_seen": 16284464, "step": 28235 }, { "epoch": 4.206136431337503, "grad_norm": 0.2293090969324112, "learning_rate": 3.591229215229733e-05, "loss": 0.7767, "num_input_tokens_seen": 16287280, "step": 28240 }, { "epoch": 4.206881143878463, "grad_norm": 0.24311134219169617, "learning_rate": 3.590644472081402e-05, "loss": 0.8003, "num_input_tokens_seen": 16290160, "step": 28245 }, { "epoch": 4.207625856419422, "grad_norm": 0.2457781732082367, "learning_rate": 3.5900596552317526e-05, "loss": 0.8141, "num_input_tokens_seen": 16293328, "step": 28250 }, { "epoch": 4.208370568960381, "grad_norm": 0.22227466106414795, "learning_rate": 3.589474764720303e-05, "loss": 0.8003, "num_input_tokens_seen": 16296272, "step": 28255 }, { "epoch": 4.20911528150134, "grad_norm": 0.23805494606494904, "learning_rate": 3.588889800586579e-05, "loss": 0.8027, "num_input_tokens_seen": 16299056, "step": 28260 }, { "epoch": 4.2098599940423, "grad_norm": 0.24360474944114685, "learning_rate": 3.588304762870108e-05, "loss": 0.8088, "num_input_tokens_seen": 16301936, "step": 28265 }, { "epoch": 4.210604706583259, "grad_norm": 0.2322738915681839, "learning_rate": 3.5877196516104275e-05, "loss": 0.797, "num_input_tokens_seen": 16304784, "step": 28270 }, { "epoch": 4.211349419124218, "grad_norm": 0.39579805731773376, "learning_rate": 3.5871344668470755e-05, "loss": 0.8325, "num_input_tokens_seen": 16307920, "step": 28275 }, { "epoch": 4.212094131665177, "grad_norm": 0.23919233679771423, "learning_rate": 3.5865492086195945e-05, "loss": 0.8089, "num_input_tokens_seen": 16310768, "step": 28280 }, { "epoch": 4.212838844206137, "grad_norm": 0.21593759953975677, "learning_rate": 3.585963876967536e-05, "loss": 0.8276, "num_input_tokens_seen": 16313616, "step": 28285 }, { "epoch": 4.213583556747095, "grad_norm": 0.24026328325271606, "learning_rate": 3.585378471930455e-05, "loss": 0.7809, "num_input_tokens_seen": 16316464, "step": 28290 }, { "epoch": 4.214328269288055, "grad_norm": 0.20858453214168549, "learning_rate": 3.584792993547908e-05, "loss": 0.7996, "num_input_tokens_seen": 16318992, "step": 28295 }, { "epoch": 4.215072981829014, "grad_norm": 0.28401821851730347, "learning_rate": 3.5842074418594625e-05, "loss": 0.8048, "num_input_tokens_seen": 16321968, "step": 28300 }, { "epoch": 4.2158176943699734, "grad_norm": 0.25085824728012085, "learning_rate": 3.583621816904686e-05, "loss": 0.7924, "num_input_tokens_seen": 16325072, "step": 28305 }, { "epoch": 4.216562406910932, "grad_norm": 0.2089659571647644, "learning_rate": 3.583036118723152e-05, "loss": 0.8086, "num_input_tokens_seen": 16327920, "step": 28310 }, { "epoch": 4.217307119451892, "grad_norm": 0.22614875435829163, "learning_rate": 3.5824503473544405e-05, "loss": 0.7823, "num_input_tokens_seen": 16330864, "step": 28315 }, { "epoch": 4.218051831992851, "grad_norm": 0.24338802695274353, "learning_rate": 3.5818645028381356e-05, "loss": 0.802, "num_input_tokens_seen": 16333712, "step": 28320 }, { "epoch": 4.21879654453381, "grad_norm": 0.25956764817237854, "learning_rate": 3.581278585213826e-05, "loss": 0.7928, "num_input_tokens_seen": 16337072, "step": 28325 }, { "epoch": 4.219541257074769, "grad_norm": 0.2753530740737915, "learning_rate": 3.5806925945211065e-05, "loss": 0.7933, "num_input_tokens_seen": 16339888, "step": 28330 }, { "epoch": 4.220285969615729, "grad_norm": 0.1929391622543335, "learning_rate": 3.580106530799575e-05, "loss": 0.8108, "num_input_tokens_seen": 16342672, "step": 28335 }, { "epoch": 4.221030682156687, "grad_norm": 0.25044000148773193, "learning_rate": 3.579520394088835e-05, "loss": 0.8011, "num_input_tokens_seen": 16345328, "step": 28340 }, { "epoch": 4.221775394697647, "grad_norm": 0.26600030064582825, "learning_rate": 3.578934184428496e-05, "loss": 0.7727, "num_input_tokens_seen": 16348080, "step": 28345 }, { "epoch": 4.222520107238606, "grad_norm": 0.23086009919643402, "learning_rate": 3.578347901858172e-05, "loss": 0.8007, "num_input_tokens_seen": 16350896, "step": 28350 }, { "epoch": 4.2232648197795655, "grad_norm": 0.216172456741333, "learning_rate": 3.57776154641748e-05, "loss": 0.7766, "num_input_tokens_seen": 16354160, "step": 28355 }, { "epoch": 4.224009532320524, "grad_norm": 0.2613624632358551, "learning_rate": 3.577175118146045e-05, "loss": 0.7917, "num_input_tokens_seen": 16356976, "step": 28360 }, { "epoch": 4.224754244861484, "grad_norm": 0.17089708149433136, "learning_rate": 3.576588617083495e-05, "loss": 0.7842, "num_input_tokens_seen": 16359888, "step": 28365 }, { "epoch": 4.225498957402443, "grad_norm": 0.2390586882829666, "learning_rate": 3.576002043269464e-05, "loss": 0.8011, "num_input_tokens_seen": 16362640, "step": 28370 }, { "epoch": 4.226243669943402, "grad_norm": 0.259037047624588, "learning_rate": 3.575415396743589e-05, "loss": 0.8226, "num_input_tokens_seen": 16365712, "step": 28375 }, { "epoch": 4.226988382484361, "grad_norm": 0.16852329671382904, "learning_rate": 3.574828677545514e-05, "loss": 0.8132, "num_input_tokens_seen": 16368464, "step": 28380 }, { "epoch": 4.22773309502532, "grad_norm": 0.27734580636024475, "learning_rate": 3.574241885714886e-05, "loss": 0.7893, "num_input_tokens_seen": 16371760, "step": 28385 }, { "epoch": 4.2284778075662794, "grad_norm": 0.41865968704223633, "learning_rate": 3.57365502129136e-05, "loss": 0.8427, "num_input_tokens_seen": 16374544, "step": 28390 }, { "epoch": 4.229222520107238, "grad_norm": 0.26700976490974426, "learning_rate": 3.573068084314593e-05, "loss": 0.7942, "num_input_tokens_seen": 16377264, "step": 28395 }, { "epoch": 4.229967232648198, "grad_norm": 0.28370368480682373, "learning_rate": 3.572481074824247e-05, "loss": 0.789, "num_input_tokens_seen": 16380208, "step": 28400 }, { "epoch": 4.230711945189157, "grad_norm": 0.27154234051704407, "learning_rate": 3.5718939928599904e-05, "loss": 0.8051, "num_input_tokens_seen": 16382832, "step": 28405 }, { "epoch": 4.231456657730116, "grad_norm": 0.22236500680446625, "learning_rate": 3.571306838461496e-05, "loss": 0.8069, "num_input_tokens_seen": 16385648, "step": 28410 }, { "epoch": 4.232201370271075, "grad_norm": 0.23434196412563324, "learning_rate": 3.570719611668441e-05, "loss": 0.8219, "num_input_tokens_seen": 16388080, "step": 28415 }, { "epoch": 4.232946082812035, "grad_norm": 0.22983594238758087, "learning_rate": 3.5701323125205076e-05, "loss": 0.8242, "num_input_tokens_seen": 16390928, "step": 28420 }, { "epoch": 4.233690795352993, "grad_norm": 0.21117277443408966, "learning_rate": 3.569544941057384e-05, "loss": 0.7481, "num_input_tokens_seen": 16393648, "step": 28425 }, { "epoch": 4.234435507893953, "grad_norm": 0.15939253568649292, "learning_rate": 3.568957497318761e-05, "loss": 0.8028, "num_input_tokens_seen": 16396496, "step": 28430 }, { "epoch": 4.235180220434912, "grad_norm": 0.19713500142097473, "learning_rate": 3.5683699813443364e-05, "loss": 0.8312, "num_input_tokens_seen": 16399408, "step": 28435 }, { "epoch": 4.2359249329758715, "grad_norm": 0.23887576162815094, "learning_rate": 3.567782393173813e-05, "loss": 0.8193, "num_input_tokens_seen": 16402352, "step": 28440 }, { "epoch": 4.23666964551683, "grad_norm": 0.18259644508361816, "learning_rate": 3.567194732846896e-05, "loss": 0.8062, "num_input_tokens_seen": 16405200, "step": 28445 }, { "epoch": 4.23741435805779, "grad_norm": 0.23422585427761078, "learning_rate": 3.566607000403298e-05, "loss": 0.8015, "num_input_tokens_seen": 16407984, "step": 28450 }, { "epoch": 4.238159070598749, "grad_norm": 0.2710086405277252, "learning_rate": 3.5660191958827354e-05, "loss": 0.7972, "num_input_tokens_seen": 16410768, "step": 28455 }, { "epoch": 4.238903783139708, "grad_norm": 0.23064163327217102, "learning_rate": 3.56543131932493e-05, "loss": 0.8024, "num_input_tokens_seen": 16413456, "step": 28460 }, { "epoch": 4.239648495680667, "grad_norm": 0.35672903060913086, "learning_rate": 3.5648433707696074e-05, "loss": 0.7983, "num_input_tokens_seen": 16416080, "step": 28465 }, { "epoch": 4.240393208221627, "grad_norm": 0.21671433746814728, "learning_rate": 3.564255350256499e-05, "loss": 0.7975, "num_input_tokens_seen": 16418992, "step": 28470 }, { "epoch": 4.2411379207625854, "grad_norm": 0.3286229074001312, "learning_rate": 3.5636672578253415e-05, "loss": 0.8535, "num_input_tokens_seen": 16422032, "step": 28475 }, { "epoch": 4.241882633303545, "grad_norm": 0.2346491813659668, "learning_rate": 3.5630790935158754e-05, "loss": 0.8156, "num_input_tokens_seen": 16424720, "step": 28480 }, { "epoch": 4.242627345844504, "grad_norm": 0.2744571268558502, "learning_rate": 3.562490857367845e-05, "loss": 0.8174, "num_input_tokens_seen": 16427376, "step": 28485 }, { "epoch": 4.2433720583854635, "grad_norm": 0.2505300045013428, "learning_rate": 3.561902549421004e-05, "loss": 0.7946, "num_input_tokens_seen": 16430224, "step": 28490 }, { "epoch": 4.244116770926422, "grad_norm": 0.25339481234550476, "learning_rate": 3.5613141697151055e-05, "loss": 0.8222, "num_input_tokens_seen": 16432880, "step": 28495 }, { "epoch": 4.244861483467382, "grad_norm": 0.2373078614473343, "learning_rate": 3.5607257182899095e-05, "loss": 0.8322, "num_input_tokens_seen": 16435728, "step": 28500 }, { "epoch": 4.245606196008341, "grad_norm": 0.21096037328243256, "learning_rate": 3.560137195185183e-05, "loss": 0.7746, "num_input_tokens_seen": 16438448, "step": 28505 }, { "epoch": 4.2463509085493, "grad_norm": 0.21719829738140106, "learning_rate": 3.559548600440695e-05, "loss": 0.8151, "num_input_tokens_seen": 16441424, "step": 28510 }, { "epoch": 4.247095621090259, "grad_norm": 0.22955238819122314, "learning_rate": 3.5589599340962196e-05, "loss": 0.8157, "num_input_tokens_seen": 16444240, "step": 28515 }, { "epoch": 4.247840333631219, "grad_norm": 0.24696579575538635, "learning_rate": 3.5583711961915375e-05, "loss": 0.827, "num_input_tokens_seen": 16447024, "step": 28520 }, { "epoch": 4.2485850461721775, "grad_norm": 0.20531068742275238, "learning_rate": 3.557782386766434e-05, "loss": 0.7769, "num_input_tokens_seen": 16449776, "step": 28525 }, { "epoch": 4.249329758713137, "grad_norm": 0.20912300050258636, "learning_rate": 3.557193505860696e-05, "loss": 0.8141, "num_input_tokens_seen": 16452720, "step": 28530 }, { "epoch": 4.250074471254096, "grad_norm": 0.2411985546350479, "learning_rate": 3.55660455351412e-05, "loss": 0.8154, "num_input_tokens_seen": 16455632, "step": 28535 }, { "epoch": 4.2508191837950555, "grad_norm": 0.23314176499843597, "learning_rate": 3.5560155297665046e-05, "loss": 0.7919, "num_input_tokens_seen": 16458608, "step": 28540 }, { "epoch": 4.251563896336014, "grad_norm": 0.2523532509803772, "learning_rate": 3.555426434657652e-05, "loss": 0.8085, "num_input_tokens_seen": 16461360, "step": 28545 }, { "epoch": 4.252308608876973, "grad_norm": 0.20304237306118011, "learning_rate": 3.5548372682273726e-05, "loss": 0.8021, "num_input_tokens_seen": 16464208, "step": 28550 }, { "epoch": 4.253053321417933, "grad_norm": 0.23432478308677673, "learning_rate": 3.554248030515479e-05, "loss": 0.7905, "num_input_tokens_seen": 16467152, "step": 28555 }, { "epoch": 4.253798033958892, "grad_norm": 0.30264225602149963, "learning_rate": 3.55365872156179e-05, "loss": 0.8229, "num_input_tokens_seen": 16469936, "step": 28560 }, { "epoch": 4.254542746499851, "grad_norm": 0.26535919308662415, "learning_rate": 3.5530693414061285e-05, "loss": 0.811, "num_input_tokens_seen": 16472592, "step": 28565 }, { "epoch": 4.25528745904081, "grad_norm": 0.17756307125091553, "learning_rate": 3.5524798900883226e-05, "loss": 0.8011, "num_input_tokens_seen": 16475600, "step": 28570 }, { "epoch": 4.2560321715817695, "grad_norm": 0.32157522439956665, "learning_rate": 3.551890367648205e-05, "loss": 0.8027, "num_input_tokens_seen": 16478416, "step": 28575 }, { "epoch": 4.256776884122728, "grad_norm": 0.263780415058136, "learning_rate": 3.551300774125611e-05, "loss": 0.8153, "num_input_tokens_seen": 16481136, "step": 28580 }, { "epoch": 4.257521596663688, "grad_norm": 0.24001823365688324, "learning_rate": 3.5507111095603864e-05, "loss": 0.8368, "num_input_tokens_seen": 16484112, "step": 28585 }, { "epoch": 4.258266309204647, "grad_norm": 0.20299112796783447, "learning_rate": 3.550121373992378e-05, "loss": 0.7801, "num_input_tokens_seen": 16486896, "step": 28590 }, { "epoch": 4.259011021745606, "grad_norm": 0.26179268956184387, "learning_rate": 3.5495315674614356e-05, "loss": 0.8209, "num_input_tokens_seen": 16489552, "step": 28595 }, { "epoch": 4.259755734286565, "grad_norm": 0.270231157541275, "learning_rate": 3.548941690007417e-05, "loss": 0.8264, "num_input_tokens_seen": 16492560, "step": 28600 }, { "epoch": 4.260500446827525, "grad_norm": 0.327815443277359, "learning_rate": 3.5483517416701836e-05, "loss": 0.795, "num_input_tokens_seen": 16495248, "step": 28605 }, { "epoch": 4.2612451593684835, "grad_norm": 0.22339019179344177, "learning_rate": 3.547761722489602e-05, "loss": 0.7994, "num_input_tokens_seen": 16498096, "step": 28610 }, { "epoch": 4.261989871909443, "grad_norm": 0.2817196846008301, "learning_rate": 3.5471716325055424e-05, "loss": 0.8124, "num_input_tokens_seen": 16500848, "step": 28615 }, { "epoch": 4.262734584450402, "grad_norm": 0.24023699760437012, "learning_rate": 3.5465814717578815e-05, "loss": 0.7848, "num_input_tokens_seen": 16504016, "step": 28620 }, { "epoch": 4.2634792969913615, "grad_norm": 0.22633777558803558, "learning_rate": 3.5459912402865006e-05, "loss": 0.7898, "num_input_tokens_seen": 16506704, "step": 28625 }, { "epoch": 4.26422400953232, "grad_norm": 0.21221493184566498, "learning_rate": 3.545400938131284e-05, "loss": 0.7887, "num_input_tokens_seen": 16509456, "step": 28630 }, { "epoch": 4.26496872207328, "grad_norm": 0.28692662715911865, "learning_rate": 3.544810565332122e-05, "loss": 0.7916, "num_input_tokens_seen": 16512560, "step": 28635 }, { "epoch": 4.265713434614239, "grad_norm": 0.30106860399246216, "learning_rate": 3.5442201219289105e-05, "loss": 0.8103, "num_input_tokens_seen": 16515504, "step": 28640 }, { "epoch": 4.266458147155198, "grad_norm": 0.2495180368423462, "learning_rate": 3.543629607961548e-05, "loss": 0.7746, "num_input_tokens_seen": 16518832, "step": 28645 }, { "epoch": 4.267202859696157, "grad_norm": 0.2326778769493103, "learning_rate": 3.5430390234699404e-05, "loss": 0.8144, "num_input_tokens_seen": 16521552, "step": 28650 }, { "epoch": 4.267947572237117, "grad_norm": 0.20149531960487366, "learning_rate": 3.542448368493996e-05, "loss": 0.8189, "num_input_tokens_seen": 16524560, "step": 28655 }, { "epoch": 4.2686922847780755, "grad_norm": 0.22215956449508667, "learning_rate": 3.5418576430736285e-05, "loss": 0.7564, "num_input_tokens_seen": 16527696, "step": 28660 }, { "epoch": 4.269436997319035, "grad_norm": 0.26109328866004944, "learning_rate": 3.5412668472487575e-05, "loss": 0.8015, "num_input_tokens_seen": 16530512, "step": 28665 }, { "epoch": 4.270181709859994, "grad_norm": 0.31083157658576965, "learning_rate": 3.540675981059307e-05, "loss": 0.8081, "num_input_tokens_seen": 16533392, "step": 28670 }, { "epoch": 4.2709264224009535, "grad_norm": 0.21087688207626343, "learning_rate": 3.540085044545205e-05, "loss": 0.8303, "num_input_tokens_seen": 16535952, "step": 28675 }, { "epoch": 4.271671134941912, "grad_norm": 0.38113996386528015, "learning_rate": 3.539494037746384e-05, "loss": 0.8007, "num_input_tokens_seen": 16538896, "step": 28680 }, { "epoch": 4.272415847482872, "grad_norm": 0.26723045110702515, "learning_rate": 3.538902960702781e-05, "loss": 0.8019, "num_input_tokens_seen": 16541584, "step": 28685 }, { "epoch": 4.273160560023831, "grad_norm": 0.17775166034698486, "learning_rate": 3.538311813454342e-05, "loss": 0.8256, "num_input_tokens_seen": 16544368, "step": 28690 }, { "epoch": 4.27390527256479, "grad_norm": 0.2721121609210968, "learning_rate": 3.537720596041011e-05, "loss": 0.7993, "num_input_tokens_seen": 16547216, "step": 28695 }, { "epoch": 4.274649985105749, "grad_norm": 0.24009719491004944, "learning_rate": 3.537129308502741e-05, "loss": 0.7835, "num_input_tokens_seen": 16550384, "step": 28700 }, { "epoch": 4.275394697646709, "grad_norm": 0.2307170331478119, "learning_rate": 3.536537950879489e-05, "loss": 0.7756, "num_input_tokens_seen": 16553072, "step": 28705 }, { "epoch": 4.2761394101876675, "grad_norm": 0.2544553577899933, "learning_rate": 3.535946523211217e-05, "loss": 0.7861, "num_input_tokens_seen": 16555760, "step": 28710 }, { "epoch": 4.276884122728626, "grad_norm": 0.24051567912101746, "learning_rate": 3.5353550255378905e-05, "loss": 0.7975, "num_input_tokens_seen": 16558768, "step": 28715 }, { "epoch": 4.277628835269586, "grad_norm": 0.3201752007007599, "learning_rate": 3.5347634578994806e-05, "loss": 0.806, "num_input_tokens_seen": 16561904, "step": 28720 }, { "epoch": 4.278373547810546, "grad_norm": 0.2727935016155243, "learning_rate": 3.534171820335964e-05, "loss": 0.8228, "num_input_tokens_seen": 16564976, "step": 28725 }, { "epoch": 4.279118260351504, "grad_norm": 0.23530510067939758, "learning_rate": 3.53358011288732e-05, "loss": 0.8305, "num_input_tokens_seen": 16567824, "step": 28730 }, { "epoch": 4.279862972892463, "grad_norm": 0.23688440024852753, "learning_rate": 3.532988335593534e-05, "loss": 0.7924, "num_input_tokens_seen": 16570192, "step": 28735 }, { "epoch": 4.280607685433423, "grad_norm": 0.2557967007160187, "learning_rate": 3.532396488494596e-05, "loss": 0.7767, "num_input_tokens_seen": 16573104, "step": 28740 }, { "epoch": 4.2813523979743815, "grad_norm": 0.2414833903312683, "learning_rate": 3.531804571630501e-05, "loss": 0.7592, "num_input_tokens_seen": 16576144, "step": 28745 }, { "epoch": 4.282097110515341, "grad_norm": 0.30949610471725464, "learning_rate": 3.531212585041248e-05, "loss": 0.8138, "num_input_tokens_seen": 16579024, "step": 28750 }, { "epoch": 4.2828418230563, "grad_norm": 0.23987381160259247, "learning_rate": 3.530620528766841e-05, "loss": 0.8266, "num_input_tokens_seen": 16581904, "step": 28755 }, { "epoch": 4.2835865355972595, "grad_norm": 0.20771722495555878, "learning_rate": 3.53002840284729e-05, "loss": 0.7853, "num_input_tokens_seen": 16584496, "step": 28760 }, { "epoch": 4.284331248138218, "grad_norm": 0.26956555247306824, "learning_rate": 3.5294362073226054e-05, "loss": 0.8257, "num_input_tokens_seen": 16587248, "step": 28765 }, { "epoch": 4.285075960679178, "grad_norm": 0.2133752852678299, "learning_rate": 3.528843942232809e-05, "loss": 0.7841, "num_input_tokens_seen": 16590032, "step": 28770 }, { "epoch": 4.285820673220137, "grad_norm": 0.20203235745429993, "learning_rate": 3.528251607617921e-05, "loss": 0.7887, "num_input_tokens_seen": 16592752, "step": 28775 }, { "epoch": 4.286565385761096, "grad_norm": 0.21412169933319092, "learning_rate": 3.52765920351797e-05, "loss": 0.7984, "num_input_tokens_seen": 16595568, "step": 28780 }, { "epoch": 4.287310098302055, "grad_norm": 0.2487037628889084, "learning_rate": 3.5270667299729883e-05, "loss": 0.8019, "num_input_tokens_seen": 16598320, "step": 28785 }, { "epoch": 4.288054810843015, "grad_norm": 0.26223868131637573, "learning_rate": 3.526474187023013e-05, "loss": 0.7978, "num_input_tokens_seen": 16601072, "step": 28790 }, { "epoch": 4.2887995233839735, "grad_norm": 0.1797129362821579, "learning_rate": 3.5258815747080853e-05, "loss": 0.8233, "num_input_tokens_seen": 16603856, "step": 28795 }, { "epoch": 4.289544235924933, "grad_norm": 0.22522780299186707, "learning_rate": 3.5252888930682516e-05, "loss": 0.7831, "num_input_tokens_seen": 16606832, "step": 28800 }, { "epoch": 4.290288948465892, "grad_norm": 0.28203871846199036, "learning_rate": 3.524696142143563e-05, "loss": 0.7747, "num_input_tokens_seen": 16610032, "step": 28805 }, { "epoch": 4.291033661006852, "grad_norm": 0.22806371748447418, "learning_rate": 3.524103321974075e-05, "loss": 0.8266, "num_input_tokens_seen": 16612784, "step": 28810 }, { "epoch": 4.29177837354781, "grad_norm": 0.171585351228714, "learning_rate": 3.523510432599849e-05, "loss": 0.8011, "num_input_tokens_seen": 16615600, "step": 28815 }, { "epoch": 4.29252308608877, "grad_norm": 0.21263402700424194, "learning_rate": 3.522917474060949e-05, "loss": 0.815, "num_input_tokens_seen": 16618544, "step": 28820 }, { "epoch": 4.293267798629729, "grad_norm": 0.22741137444972992, "learning_rate": 3.522324446397444e-05, "loss": 0.8216, "num_input_tokens_seen": 16621296, "step": 28825 }, { "epoch": 4.294012511170688, "grad_norm": 0.27001044154167175, "learning_rate": 3.5217313496494096e-05, "loss": 0.7768, "num_input_tokens_seen": 16624144, "step": 28830 }, { "epoch": 4.294757223711647, "grad_norm": 0.2794148623943329, "learning_rate": 3.521138183856926e-05, "loss": 0.8188, "num_input_tokens_seen": 16627216, "step": 28835 }, { "epoch": 4.295501936252607, "grad_norm": 0.22284241020679474, "learning_rate": 3.520544949060075e-05, "loss": 0.8047, "num_input_tokens_seen": 16630160, "step": 28840 }, { "epoch": 4.2962466487935655, "grad_norm": 0.1977439820766449, "learning_rate": 3.5199516452989444e-05, "loss": 0.7989, "num_input_tokens_seen": 16632784, "step": 28845 }, { "epoch": 4.296991361334525, "grad_norm": 0.22761425375938416, "learning_rate": 3.51935827261363e-05, "loss": 0.8218, "num_input_tokens_seen": 16635600, "step": 28850 }, { "epoch": 4.297736073875484, "grad_norm": 0.26961079239845276, "learning_rate": 3.518764831044228e-05, "loss": 0.7672, "num_input_tokens_seen": 16638640, "step": 28855 }, { "epoch": 4.298480786416444, "grad_norm": 0.2551229000091553, "learning_rate": 3.518171320630839e-05, "loss": 0.8137, "num_input_tokens_seen": 16641616, "step": 28860 }, { "epoch": 4.299225498957402, "grad_norm": 0.23272007703781128, "learning_rate": 3.5175777414135726e-05, "loss": 0.7928, "num_input_tokens_seen": 16644592, "step": 28865 }, { "epoch": 4.299970211498362, "grad_norm": 0.2728208601474762, "learning_rate": 3.5169840934325404e-05, "loss": 0.8245, "num_input_tokens_seen": 16647536, "step": 28870 }, { "epoch": 4.300714924039321, "grad_norm": 0.27815747261047363, "learning_rate": 3.5163903767278573e-05, "loss": 0.806, "num_input_tokens_seen": 16650512, "step": 28875 }, { "epoch": 4.30145963658028, "grad_norm": 0.24175859987735748, "learning_rate": 3.515796591339644e-05, "loss": 0.7838, "num_input_tokens_seen": 16653200, "step": 28880 }, { "epoch": 4.302204349121239, "grad_norm": 0.26261892914772034, "learning_rate": 3.515202737308028e-05, "loss": 0.7784, "num_input_tokens_seen": 16655920, "step": 28885 }, { "epoch": 4.302949061662199, "grad_norm": 0.23298189043998718, "learning_rate": 3.514608814673139e-05, "loss": 0.7763, "num_input_tokens_seen": 16658896, "step": 28890 }, { "epoch": 4.303693774203158, "grad_norm": 0.231049045920372, "learning_rate": 3.5140148234751106e-05, "loss": 0.796, "num_input_tokens_seen": 16661712, "step": 28895 }, { "epoch": 4.304438486744116, "grad_norm": 0.1964990645647049, "learning_rate": 3.513420763754083e-05, "loss": 0.8278, "num_input_tokens_seen": 16664368, "step": 28900 }, { "epoch": 4.305183199285076, "grad_norm": 0.24093857407569885, "learning_rate": 3.512826635550201e-05, "loss": 0.797, "num_input_tokens_seen": 16667184, "step": 28905 }, { "epoch": 4.305927911826035, "grad_norm": 0.3001805543899536, "learning_rate": 3.512232438903612e-05, "loss": 0.8021, "num_input_tokens_seen": 16670416, "step": 28910 }, { "epoch": 4.306672624366994, "grad_norm": 0.2860085070133209, "learning_rate": 3.511638173854471e-05, "loss": 0.8054, "num_input_tokens_seen": 16673264, "step": 28915 }, { "epoch": 4.307417336907953, "grad_norm": 0.1744430959224701, "learning_rate": 3.511043840442936e-05, "loss": 0.7908, "num_input_tokens_seen": 16676112, "step": 28920 }, { "epoch": 4.308162049448913, "grad_norm": 0.21971847116947174, "learning_rate": 3.510449438709167e-05, "loss": 0.817, "num_input_tokens_seen": 16678928, "step": 28925 }, { "epoch": 4.3089067619898715, "grad_norm": 0.18740314245224, "learning_rate": 3.509854968693334e-05, "loss": 0.7979, "num_input_tokens_seen": 16682128, "step": 28930 }, { "epoch": 4.309651474530831, "grad_norm": 0.24018117785453796, "learning_rate": 3.509260430435608e-05, "loss": 0.784, "num_input_tokens_seen": 16685136, "step": 28935 }, { "epoch": 4.31039618707179, "grad_norm": 0.1941216140985489, "learning_rate": 3.5086658239761664e-05, "loss": 0.8039, "num_input_tokens_seen": 16688016, "step": 28940 }, { "epoch": 4.31114089961275, "grad_norm": 0.23811738193035126, "learning_rate": 3.5080711493551876e-05, "loss": 0.8241, "num_input_tokens_seen": 16690864, "step": 28945 }, { "epoch": 4.311885612153708, "grad_norm": 0.287436842918396, "learning_rate": 3.5074764066128594e-05, "loss": 0.7991, "num_input_tokens_seen": 16693680, "step": 28950 }, { "epoch": 4.312630324694668, "grad_norm": 0.2769479751586914, "learning_rate": 3.506881595789373e-05, "loss": 0.808, "num_input_tokens_seen": 16696752, "step": 28955 }, { "epoch": 4.313375037235627, "grad_norm": 0.24307166039943695, "learning_rate": 3.506286716924921e-05, "loss": 0.8192, "num_input_tokens_seen": 16699536, "step": 28960 }, { "epoch": 4.314119749776586, "grad_norm": 0.2508835196495056, "learning_rate": 3.505691770059704e-05, "loss": 0.796, "num_input_tokens_seen": 16702128, "step": 28965 }, { "epoch": 4.314864462317545, "grad_norm": 0.23875200748443604, "learning_rate": 3.5050967552339265e-05, "loss": 0.8469, "num_input_tokens_seen": 16704816, "step": 28970 }, { "epoch": 4.315609174858505, "grad_norm": 0.22317945957183838, "learning_rate": 3.5045016724877967e-05, "loss": 0.8026, "num_input_tokens_seen": 16707568, "step": 28975 }, { "epoch": 4.316353887399464, "grad_norm": 0.25098717212677, "learning_rate": 3.503906521861527e-05, "loss": 0.8197, "num_input_tokens_seen": 16710800, "step": 28980 }, { "epoch": 4.317098599940423, "grad_norm": 0.27907589077949524, "learning_rate": 3.503311303395337e-05, "loss": 0.7975, "num_input_tokens_seen": 16713456, "step": 28985 }, { "epoch": 4.317843312481382, "grad_norm": 0.19184431433677673, "learning_rate": 3.5027160171294476e-05, "loss": 0.7939, "num_input_tokens_seen": 16715984, "step": 28990 }, { "epoch": 4.318588025022342, "grad_norm": 0.221233069896698, "learning_rate": 3.502120663104087e-05, "loss": 0.82, "num_input_tokens_seen": 16718704, "step": 28995 }, { "epoch": 4.3193327375633, "grad_norm": 0.23664143681526184, "learning_rate": 3.5015252413594864e-05, "loss": 0.7974, "num_input_tokens_seen": 16721552, "step": 29000 }, { "epoch": 4.32007745010426, "grad_norm": 0.17279572784900665, "learning_rate": 3.5009297519358816e-05, "loss": 0.7992, "num_input_tokens_seen": 16724304, "step": 29005 }, { "epoch": 4.320822162645219, "grad_norm": 0.3073805570602417, "learning_rate": 3.500334194873513e-05, "loss": 0.8128, "num_input_tokens_seen": 16727248, "step": 29010 }, { "epoch": 4.321566875186178, "grad_norm": 0.23656374216079712, "learning_rate": 3.499738570212628e-05, "loss": 0.802, "num_input_tokens_seen": 16730000, "step": 29015 }, { "epoch": 4.322311587727137, "grad_norm": 0.24371767044067383, "learning_rate": 3.4991428779934746e-05, "loss": 0.8169, "num_input_tokens_seen": 16732752, "step": 29020 }, { "epoch": 4.323056300268097, "grad_norm": 0.2612355649471283, "learning_rate": 3.498547118256307e-05, "loss": 0.7876, "num_input_tokens_seen": 16735984, "step": 29025 }, { "epoch": 4.323801012809056, "grad_norm": 0.1888614445924759, "learning_rate": 3.497951291041386e-05, "loss": 0.801, "num_input_tokens_seen": 16738640, "step": 29030 }, { "epoch": 4.324545725350015, "grad_norm": 0.16087493300437927, "learning_rate": 3.497355396388974e-05, "loss": 0.8061, "num_input_tokens_seen": 16741552, "step": 29035 }, { "epoch": 4.325290437890974, "grad_norm": 0.19467094540596008, "learning_rate": 3.496759434339338e-05, "loss": 0.7816, "num_input_tokens_seen": 16744240, "step": 29040 }, { "epoch": 4.326035150431934, "grad_norm": 0.2714962363243103, "learning_rate": 3.4961634049327527e-05, "loss": 0.7867, "num_input_tokens_seen": 16747216, "step": 29045 }, { "epoch": 4.326779862972892, "grad_norm": 0.25936996936798096, "learning_rate": 3.495567308209495e-05, "loss": 0.8145, "num_input_tokens_seen": 16750256, "step": 29050 }, { "epoch": 4.327524575513852, "grad_norm": 0.289585143327713, "learning_rate": 3.4949711442098464e-05, "loss": 0.8135, "num_input_tokens_seen": 16753072, "step": 29055 }, { "epoch": 4.328269288054811, "grad_norm": 0.307515412569046, "learning_rate": 3.494374912974093e-05, "loss": 0.8005, "num_input_tokens_seen": 16756144, "step": 29060 }, { "epoch": 4.32901400059577, "grad_norm": 0.18354816734790802, "learning_rate": 3.493778614542525e-05, "loss": 0.8096, "num_input_tokens_seen": 16759440, "step": 29065 }, { "epoch": 4.329758713136729, "grad_norm": 0.2285536378622055, "learning_rate": 3.493182248955439e-05, "loss": 0.7975, "num_input_tokens_seen": 16762320, "step": 29070 }, { "epoch": 4.330503425677689, "grad_norm": 0.227910578250885, "learning_rate": 3.4925858162531354e-05, "loss": 0.8219, "num_input_tokens_seen": 16765456, "step": 29075 }, { "epoch": 4.331248138218648, "grad_norm": 0.3144631087779999, "learning_rate": 3.491989316475917e-05, "loss": 0.7992, "num_input_tokens_seen": 16768368, "step": 29080 }, { "epoch": 4.331992850759606, "grad_norm": 0.2787826657295227, "learning_rate": 3.491392749664094e-05, "loss": 0.79, "num_input_tokens_seen": 16771248, "step": 29085 }, { "epoch": 4.332737563300566, "grad_norm": 0.20577853918075562, "learning_rate": 3.49079611585798e-05, "loss": 0.8109, "num_input_tokens_seen": 16774288, "step": 29090 }, { "epoch": 4.333482275841525, "grad_norm": 0.35279735922813416, "learning_rate": 3.490199415097892e-05, "loss": 0.8028, "num_input_tokens_seen": 16777008, "step": 29095 }, { "epoch": 4.334226988382484, "grad_norm": 0.24473312497138977, "learning_rate": 3.489602647424154e-05, "loss": 0.8116, "num_input_tokens_seen": 16779760, "step": 29100 }, { "epoch": 4.334971700923443, "grad_norm": 0.214035764336586, "learning_rate": 3.489005812877093e-05, "loss": 0.8094, "num_input_tokens_seen": 16782512, "step": 29105 }, { "epoch": 4.335716413464403, "grad_norm": 0.20369915664196014, "learning_rate": 3.488408911497039e-05, "loss": 0.7861, "num_input_tokens_seen": 16785680, "step": 29110 }, { "epoch": 4.336461126005362, "grad_norm": 0.22589965164661407, "learning_rate": 3.48781194332433e-05, "loss": 0.7622, "num_input_tokens_seen": 16788592, "step": 29115 }, { "epoch": 4.337205838546321, "grad_norm": 0.2151351273059845, "learning_rate": 3.487214908399306e-05, "loss": 0.8072, "num_input_tokens_seen": 16791632, "step": 29120 }, { "epoch": 4.33795055108728, "grad_norm": 0.23032155632972717, "learning_rate": 3.486617806762312e-05, "loss": 0.8023, "num_input_tokens_seen": 16794672, "step": 29125 }, { "epoch": 4.33869526362824, "grad_norm": 0.20922790467739105, "learning_rate": 3.486020638453698e-05, "loss": 0.8125, "num_input_tokens_seen": 16797328, "step": 29130 }, { "epoch": 4.339439976169198, "grad_norm": 0.19054454565048218, "learning_rate": 3.485423403513818e-05, "loss": 0.8147, "num_input_tokens_seen": 16800048, "step": 29135 }, { "epoch": 4.340184688710158, "grad_norm": 0.22636061906814575, "learning_rate": 3.484826101983031e-05, "loss": 0.7997, "num_input_tokens_seen": 16802832, "step": 29140 }, { "epoch": 4.340929401251117, "grad_norm": 0.22170062363147736, "learning_rate": 3.4842287339016997e-05, "loss": 0.8044, "num_input_tokens_seen": 16805712, "step": 29145 }, { "epoch": 4.3416741137920765, "grad_norm": 0.2792793810367584, "learning_rate": 3.483631299310193e-05, "loss": 0.8701, "num_input_tokens_seen": 16808368, "step": 29150 }, { "epoch": 4.342418826333035, "grad_norm": 0.22625596821308136, "learning_rate": 3.483033798248882e-05, "loss": 0.8086, "num_input_tokens_seen": 16811184, "step": 29155 }, { "epoch": 4.343163538873995, "grad_norm": 0.22154515981674194, "learning_rate": 3.4824362307581435e-05, "loss": 0.7921, "num_input_tokens_seen": 16814032, "step": 29160 }, { "epoch": 4.343908251414954, "grad_norm": 0.22334368526935577, "learning_rate": 3.4818385968783584e-05, "loss": 0.8188, "num_input_tokens_seen": 16817072, "step": 29165 }, { "epoch": 4.344652963955913, "grad_norm": 0.19673825800418854, "learning_rate": 3.481240896649913e-05, "loss": 0.8244, "num_input_tokens_seen": 16819888, "step": 29170 }, { "epoch": 4.345397676496872, "grad_norm": 0.3162122666835785, "learning_rate": 3.4806431301131974e-05, "loss": 0.8037, "num_input_tokens_seen": 16822736, "step": 29175 }, { "epoch": 4.346142389037832, "grad_norm": 0.17973679304122925, "learning_rate": 3.480045297308606e-05, "loss": 0.7961, "num_input_tokens_seen": 16825328, "step": 29180 }, { "epoch": 4.34688710157879, "grad_norm": 0.2314731776714325, "learning_rate": 3.479447398276538e-05, "loss": 0.8221, "num_input_tokens_seen": 16827984, "step": 29185 }, { "epoch": 4.34763181411975, "grad_norm": 0.22868712246418, "learning_rate": 3.4788494330573965e-05, "loss": 0.8038, "num_input_tokens_seen": 16830512, "step": 29190 }, { "epoch": 4.348376526660709, "grad_norm": 0.372237890958786, "learning_rate": 3.478251401691591e-05, "loss": 0.8543, "num_input_tokens_seen": 16833648, "step": 29195 }, { "epoch": 4.3491212392016685, "grad_norm": 0.36298036575317383, "learning_rate": 3.4776533042195324e-05, "loss": 0.814, "num_input_tokens_seen": 16836816, "step": 29200 }, { "epoch": 4.349865951742627, "grad_norm": 0.2732745110988617, "learning_rate": 3.477055140681639e-05, "loss": 0.7949, "num_input_tokens_seen": 16839632, "step": 29205 }, { "epoch": 4.350610664283587, "grad_norm": 0.25366029143333435, "learning_rate": 3.4764569111183304e-05, "loss": 0.7993, "num_input_tokens_seen": 16842512, "step": 29210 }, { "epoch": 4.351355376824546, "grad_norm": 0.2212938666343689, "learning_rate": 3.475858615570035e-05, "loss": 0.8081, "num_input_tokens_seen": 16845200, "step": 29215 }, { "epoch": 4.352100089365505, "grad_norm": 0.23040206730365753, "learning_rate": 3.475260254077181e-05, "loss": 0.7972, "num_input_tokens_seen": 16848016, "step": 29220 }, { "epoch": 4.352844801906464, "grad_norm": 0.2691592276096344, "learning_rate": 3.474661826680204e-05, "loss": 0.7739, "num_input_tokens_seen": 16850800, "step": 29225 }, { "epoch": 4.353589514447424, "grad_norm": 0.2841445803642273, "learning_rate": 3.474063333419544e-05, "loss": 0.7803, "num_input_tokens_seen": 16853680, "step": 29230 }, { "epoch": 4.3543342269883825, "grad_norm": 0.27424758672714233, "learning_rate": 3.473464774335644e-05, "loss": 0.8055, "num_input_tokens_seen": 16856560, "step": 29235 }, { "epoch": 4.355078939529342, "grad_norm": 0.2036643624305725, "learning_rate": 3.472866149468953e-05, "loss": 0.8218, "num_input_tokens_seen": 16859440, "step": 29240 }, { "epoch": 4.355823652070301, "grad_norm": 0.3252439796924591, "learning_rate": 3.472267458859922e-05, "loss": 0.8105, "num_input_tokens_seen": 16862288, "step": 29245 }, { "epoch": 4.35656836461126, "grad_norm": 0.17258642613887787, "learning_rate": 3.47166870254901e-05, "loss": 0.7968, "num_input_tokens_seen": 16865008, "step": 29250 }, { "epoch": 4.357313077152219, "grad_norm": 0.30288010835647583, "learning_rate": 3.471069880576677e-05, "loss": 0.7955, "num_input_tokens_seen": 16867920, "step": 29255 }, { "epoch": 4.358057789693178, "grad_norm": 0.19817419350147247, "learning_rate": 3.470470992983389e-05, "loss": 0.7827, "num_input_tokens_seen": 16870736, "step": 29260 }, { "epoch": 4.358802502234138, "grad_norm": 0.2263636589050293, "learning_rate": 3.4698720398096176e-05, "loss": 0.7704, "num_input_tokens_seen": 16873552, "step": 29265 }, { "epoch": 4.359547214775096, "grad_norm": 0.20797480642795563, "learning_rate": 3.4692730210958376e-05, "loss": 0.8154, "num_input_tokens_seen": 16876720, "step": 29270 }, { "epoch": 4.360291927316056, "grad_norm": 0.28892937302589417, "learning_rate": 3.468673936882527e-05, "loss": 0.8124, "num_input_tokens_seen": 16879632, "step": 29275 }, { "epoch": 4.361036639857015, "grad_norm": 0.21968600153923035, "learning_rate": 3.46807478721017e-05, "loss": 0.8241, "num_input_tokens_seen": 16882576, "step": 29280 }, { "epoch": 4.3617813523979745, "grad_norm": 0.21081429719924927, "learning_rate": 3.4674755721192555e-05, "loss": 0.7772, "num_input_tokens_seen": 16885392, "step": 29285 }, { "epoch": 4.362526064938933, "grad_norm": 0.3816879987716675, "learning_rate": 3.466876291650274e-05, "loss": 0.8013, "num_input_tokens_seen": 16888272, "step": 29290 }, { "epoch": 4.363270777479893, "grad_norm": 0.2564900815486908, "learning_rate": 3.466276945843725e-05, "loss": 0.7834, "num_input_tokens_seen": 16891248, "step": 29295 }, { "epoch": 4.364015490020852, "grad_norm": 0.21651962399482727, "learning_rate": 3.465677534740107e-05, "loss": 0.7905, "num_input_tokens_seen": 16894320, "step": 29300 }, { "epoch": 4.364760202561811, "grad_norm": 0.2183804214000702, "learning_rate": 3.4650780583799294e-05, "loss": 0.7799, "num_input_tokens_seen": 16897072, "step": 29305 }, { "epoch": 4.36550491510277, "grad_norm": 0.21668405830860138, "learning_rate": 3.464478516803699e-05, "loss": 0.7976, "num_input_tokens_seen": 16899888, "step": 29310 }, { "epoch": 4.36624962764373, "grad_norm": 0.2674563527107239, "learning_rate": 3.463878910051932e-05, "loss": 0.8163, "num_input_tokens_seen": 16902832, "step": 29315 }, { "epoch": 4.3669943401846885, "grad_norm": 0.29329806566238403, "learning_rate": 3.4632792381651473e-05, "loss": 0.7993, "num_input_tokens_seen": 16905456, "step": 29320 }, { "epoch": 4.367739052725648, "grad_norm": 0.20647816359996796, "learning_rate": 3.462679501183867e-05, "loss": 0.7951, "num_input_tokens_seen": 16908400, "step": 29325 }, { "epoch": 4.368483765266607, "grad_norm": 0.31390655040740967, "learning_rate": 3.462079699148622e-05, "loss": 0.824, "num_input_tokens_seen": 16911248, "step": 29330 }, { "epoch": 4.3692284778075665, "grad_norm": 0.18430711328983307, "learning_rate": 3.4614798320999406e-05, "loss": 0.7467, "num_input_tokens_seen": 16914096, "step": 29335 }, { "epoch": 4.369973190348525, "grad_norm": 0.22802747786045074, "learning_rate": 3.4608799000783624e-05, "loss": 0.8022, "num_input_tokens_seen": 16917008, "step": 29340 }, { "epoch": 4.370717902889485, "grad_norm": 0.28254562616348267, "learning_rate": 3.460279903124427e-05, "loss": 0.8415, "num_input_tokens_seen": 16919728, "step": 29345 }, { "epoch": 4.371462615430444, "grad_norm": 0.19205023348331451, "learning_rate": 3.45967984127868e-05, "loss": 0.7807, "num_input_tokens_seen": 16922544, "step": 29350 }, { "epoch": 4.372207327971403, "grad_norm": 0.24365176260471344, "learning_rate": 3.4590797145816714e-05, "loss": 0.7882, "num_input_tokens_seen": 16925584, "step": 29355 }, { "epoch": 4.372952040512362, "grad_norm": 0.21324841678142548, "learning_rate": 3.4584795230739535e-05, "loss": 0.7854, "num_input_tokens_seen": 16928400, "step": 29360 }, { "epoch": 4.373696753053322, "grad_norm": 0.22899362444877625, "learning_rate": 3.457879266796087e-05, "loss": 0.7905, "num_input_tokens_seen": 16931312, "step": 29365 }, { "epoch": 4.3744414655942805, "grad_norm": 0.24107593297958374, "learning_rate": 3.457278945788635e-05, "loss": 0.7889, "num_input_tokens_seen": 16934160, "step": 29370 }, { "epoch": 4.37518617813524, "grad_norm": 0.209456667304039, "learning_rate": 3.456678560092164e-05, "loss": 0.8073, "num_input_tokens_seen": 16937136, "step": 29375 }, { "epoch": 4.375930890676199, "grad_norm": 0.21170005202293396, "learning_rate": 3.4560781097472436e-05, "loss": 0.7947, "num_input_tokens_seen": 16939888, "step": 29380 }, { "epoch": 4.3766756032171585, "grad_norm": 0.17707480490207672, "learning_rate": 3.455477594794454e-05, "loss": 0.7692, "num_input_tokens_seen": 16942448, "step": 29385 }, { "epoch": 4.377420315758117, "grad_norm": 0.24083729088306427, "learning_rate": 3.454877015274371e-05, "loss": 0.7921, "num_input_tokens_seen": 16945552, "step": 29390 }, { "epoch": 4.378165028299077, "grad_norm": 0.2334405481815338, "learning_rate": 3.4542763712275836e-05, "loss": 0.8096, "num_input_tokens_seen": 16948400, "step": 29395 }, { "epoch": 4.378909740840036, "grad_norm": 0.28665509819984436, "learning_rate": 3.453675662694677e-05, "loss": 0.8291, "num_input_tokens_seen": 16951376, "step": 29400 }, { "epoch": 4.379654453380995, "grad_norm": 0.24552026391029358, "learning_rate": 3.453074889716248e-05, "loss": 0.7821, "num_input_tokens_seen": 16954288, "step": 29405 }, { "epoch": 4.380399165921954, "grad_norm": 0.3024829030036926, "learning_rate": 3.452474052332891e-05, "loss": 0.8257, "num_input_tokens_seen": 16957136, "step": 29410 }, { "epoch": 4.381143878462913, "grad_norm": 0.2681923806667328, "learning_rate": 3.451873150585212e-05, "loss": 0.788, "num_input_tokens_seen": 16960208, "step": 29415 }, { "epoch": 4.3818885910038725, "grad_norm": 0.22834111750125885, "learning_rate": 3.451272184513815e-05, "loss": 0.7988, "num_input_tokens_seen": 16963312, "step": 29420 }, { "epoch": 4.382633303544832, "grad_norm": 0.1991887390613556, "learning_rate": 3.4506711541593107e-05, "loss": 0.8202, "num_input_tokens_seen": 16966032, "step": 29425 }, { "epoch": 4.383378016085791, "grad_norm": 0.18537279963493347, "learning_rate": 3.450070059562315e-05, "loss": 0.8194, "num_input_tokens_seen": 16968880, "step": 29430 }, { "epoch": 4.38412272862675, "grad_norm": 0.17949216067790985, "learning_rate": 3.449468900763448e-05, "loss": 0.8297, "num_input_tokens_seen": 16971536, "step": 29435 }, { "epoch": 4.384867441167709, "grad_norm": 0.25067299604415894, "learning_rate": 3.448867677803333e-05, "loss": 0.8257, "num_input_tokens_seen": 16974480, "step": 29440 }, { "epoch": 4.385612153708668, "grad_norm": 0.2673461437225342, "learning_rate": 3.4482663907225975e-05, "loss": 0.8362, "num_input_tokens_seen": 16977584, "step": 29445 }, { "epoch": 4.386356866249628, "grad_norm": 0.2707337737083435, "learning_rate": 3.447665039561875e-05, "loss": 0.7954, "num_input_tokens_seen": 16980592, "step": 29450 }, { "epoch": 4.3871015787905865, "grad_norm": 0.15702122449874878, "learning_rate": 3.4470636243618026e-05, "loss": 0.7833, "num_input_tokens_seen": 16983408, "step": 29455 }, { "epoch": 4.387846291331546, "grad_norm": 0.2987138330936432, "learning_rate": 3.44646214516302e-05, "loss": 0.791, "num_input_tokens_seen": 16986384, "step": 29460 }, { "epoch": 4.388591003872505, "grad_norm": 0.24156008660793304, "learning_rate": 3.4458606020061744e-05, "loss": 0.8027, "num_input_tokens_seen": 16989200, "step": 29465 }, { "epoch": 4.3893357164134645, "grad_norm": 0.2470889687538147, "learning_rate": 3.445258994931915e-05, "loss": 0.7939, "num_input_tokens_seen": 16991760, "step": 29470 }, { "epoch": 4.390080428954423, "grad_norm": 0.21049033105373383, "learning_rate": 3.444657323980895e-05, "loss": 0.7886, "num_input_tokens_seen": 16994576, "step": 29475 }, { "epoch": 4.390825141495383, "grad_norm": 0.20777127146720886, "learning_rate": 3.444055589193774e-05, "loss": 0.8021, "num_input_tokens_seen": 16997296, "step": 29480 }, { "epoch": 4.391569854036342, "grad_norm": 0.2671550214290619, "learning_rate": 3.443453790611215e-05, "loss": 0.8147, "num_input_tokens_seen": 17000112, "step": 29485 }, { "epoch": 4.392314566577301, "grad_norm": 0.19582444429397583, "learning_rate": 3.442851928273884e-05, "loss": 0.7945, "num_input_tokens_seen": 17003280, "step": 29490 }, { "epoch": 4.39305927911826, "grad_norm": 0.3083096146583557, "learning_rate": 3.4422500022224536e-05, "loss": 0.8387, "num_input_tokens_seen": 17006096, "step": 29495 }, { "epoch": 4.39380399165922, "grad_norm": 0.25140881538391113, "learning_rate": 3.4416480124975995e-05, "loss": 0.8428, "num_input_tokens_seen": 17009136, "step": 29500 }, { "epoch": 4.3945487042001785, "grad_norm": 0.27139538526535034, "learning_rate": 3.44104595914e-05, "loss": 0.794, "num_input_tokens_seen": 17012272, "step": 29505 }, { "epoch": 4.395293416741138, "grad_norm": 0.17892983555793762, "learning_rate": 3.440443842190341e-05, "loss": 0.8109, "num_input_tokens_seen": 17014832, "step": 29510 }, { "epoch": 4.396038129282097, "grad_norm": 0.20576657354831696, "learning_rate": 3.439841661689311e-05, "loss": 0.8026, "num_input_tokens_seen": 17017456, "step": 29515 }, { "epoch": 4.396782841823057, "grad_norm": 0.22840584814548492, "learning_rate": 3.439239417677602e-05, "loss": 0.8141, "num_input_tokens_seen": 17020400, "step": 29520 }, { "epoch": 4.397527554364015, "grad_norm": 0.19423125684261322, "learning_rate": 3.4386371101959125e-05, "loss": 0.7889, "num_input_tokens_seen": 17023248, "step": 29525 }, { "epoch": 4.398272266904975, "grad_norm": 0.22756725549697876, "learning_rate": 3.4380347392849424e-05, "loss": 0.7709, "num_input_tokens_seen": 17026128, "step": 29530 }, { "epoch": 4.399016979445934, "grad_norm": 0.25487586855888367, "learning_rate": 3.4374323049854e-05, "loss": 0.8156, "num_input_tokens_seen": 17028880, "step": 29535 }, { "epoch": 4.399761691986893, "grad_norm": 0.17714384198188782, "learning_rate": 3.436829807337992e-05, "loss": 0.7881, "num_input_tokens_seen": 17031792, "step": 29540 }, { "epoch": 4.400506404527852, "grad_norm": 0.3166598975658417, "learning_rate": 3.436227246383435e-05, "loss": 0.8053, "num_input_tokens_seen": 17034576, "step": 29545 }, { "epoch": 4.401251117068812, "grad_norm": 0.3178851902484894, "learning_rate": 3.435624622162448e-05, "loss": 0.8255, "num_input_tokens_seen": 17037552, "step": 29550 }, { "epoch": 4.4019958296097705, "grad_norm": 0.20931798219680786, "learning_rate": 3.435021934715752e-05, "loss": 0.7981, "num_input_tokens_seen": 17040400, "step": 29555 }, { "epoch": 4.40274054215073, "grad_norm": 0.3365322947502136, "learning_rate": 3.4344191840840755e-05, "loss": 0.809, "num_input_tokens_seen": 17043120, "step": 29560 }, { "epoch": 4.403485254691689, "grad_norm": 0.177287295460701, "learning_rate": 3.4338163703081495e-05, "loss": 0.8079, "num_input_tokens_seen": 17046064, "step": 29565 }, { "epoch": 4.404229967232649, "grad_norm": 0.22053441405296326, "learning_rate": 3.43321349342871e-05, "loss": 0.8066, "num_input_tokens_seen": 17048976, "step": 29570 }, { "epoch": 4.404974679773607, "grad_norm": 0.2062794715166092, "learning_rate": 3.432610553486497e-05, "loss": 0.7902, "num_input_tokens_seen": 17051600, "step": 29575 }, { "epoch": 4.405719392314566, "grad_norm": 0.171676903963089, "learning_rate": 3.432007550522254e-05, "loss": 0.8165, "num_input_tokens_seen": 17054512, "step": 29580 }, { "epoch": 4.406464104855526, "grad_norm": 0.21592402458190918, "learning_rate": 3.431404484576731e-05, "loss": 0.7963, "num_input_tokens_seen": 17057168, "step": 29585 }, { "epoch": 4.407208817396485, "grad_norm": 0.22583723068237305, "learning_rate": 3.430801355690679e-05, "loss": 0.818, "num_input_tokens_seen": 17060016, "step": 29590 }, { "epoch": 4.407953529937444, "grad_norm": 0.2587546408176422, "learning_rate": 3.430198163904855e-05, "loss": 0.8135, "num_input_tokens_seen": 17062736, "step": 29595 }, { "epoch": 4.408698242478403, "grad_norm": 0.2211652249097824, "learning_rate": 3.429594909260023e-05, "loss": 0.8157, "num_input_tokens_seen": 17065456, "step": 29600 }, { "epoch": 4.409442955019363, "grad_norm": 0.19889146089553833, "learning_rate": 3.428991591796944e-05, "loss": 0.7972, "num_input_tokens_seen": 17068560, "step": 29605 }, { "epoch": 4.410187667560321, "grad_norm": 0.32578298449516296, "learning_rate": 3.428388211556391e-05, "loss": 0.7998, "num_input_tokens_seen": 17071472, "step": 29610 }, { "epoch": 4.410932380101281, "grad_norm": 0.165760338306427, "learning_rate": 3.4277847685791384e-05, "loss": 0.7943, "num_input_tokens_seen": 17074480, "step": 29615 }, { "epoch": 4.41167709264224, "grad_norm": 0.23251737654209137, "learning_rate": 3.427181262905963e-05, "loss": 0.8091, "num_input_tokens_seen": 17077328, "step": 29620 }, { "epoch": 4.412421805183199, "grad_norm": 0.21930663287639618, "learning_rate": 3.4265776945776464e-05, "loss": 0.794, "num_input_tokens_seen": 17080368, "step": 29625 }, { "epoch": 4.413166517724158, "grad_norm": 0.2732168734073639, "learning_rate": 3.425974063634977e-05, "loss": 0.8216, "num_input_tokens_seen": 17083088, "step": 29630 }, { "epoch": 4.413911230265118, "grad_norm": 0.29512518644332886, "learning_rate": 3.4253703701187455e-05, "loss": 0.8114, "num_input_tokens_seen": 17086160, "step": 29635 }, { "epoch": 4.4146559428060765, "grad_norm": 0.22423124313354492, "learning_rate": 3.4247666140697466e-05, "loss": 0.7891, "num_input_tokens_seen": 17089008, "step": 29640 }, { "epoch": 4.415400655347036, "grad_norm": 0.23890826106071472, "learning_rate": 3.424162795528779e-05, "loss": 0.8105, "num_input_tokens_seen": 17091888, "step": 29645 }, { "epoch": 4.416145367887995, "grad_norm": 0.2429228276014328, "learning_rate": 3.423558914536648e-05, "loss": 0.7929, "num_input_tokens_seen": 17094768, "step": 29650 }, { "epoch": 4.416890080428955, "grad_norm": 0.29061567783355713, "learning_rate": 3.42295497113416e-05, "loss": 0.7984, "num_input_tokens_seen": 17097616, "step": 29655 }, { "epoch": 4.417634792969913, "grad_norm": 0.29759612679481506, "learning_rate": 3.4223509653621275e-05, "loss": 0.8311, "num_input_tokens_seen": 17100432, "step": 29660 }, { "epoch": 4.418379505510873, "grad_norm": 0.23181559145450592, "learning_rate": 3.421746897261367e-05, "loss": 0.778, "num_input_tokens_seen": 17103248, "step": 29665 }, { "epoch": 4.419124218051832, "grad_norm": 0.30045291781425476, "learning_rate": 3.421142766872698e-05, "loss": 0.8098, "num_input_tokens_seen": 17106256, "step": 29670 }, { "epoch": 4.419868930592791, "grad_norm": 0.2981318235397339, "learning_rate": 3.420538574236946e-05, "loss": 0.8057, "num_input_tokens_seen": 17109168, "step": 29675 }, { "epoch": 4.42061364313375, "grad_norm": 0.17836277186870575, "learning_rate": 3.4199343193949404e-05, "loss": 0.7844, "num_input_tokens_seen": 17112048, "step": 29680 }, { "epoch": 4.42135835567471, "grad_norm": 0.19987577199935913, "learning_rate": 3.419330002387514e-05, "loss": 0.7865, "num_input_tokens_seen": 17114832, "step": 29685 }, { "epoch": 4.422103068215669, "grad_norm": 0.2890850603580475, "learning_rate": 3.418725623255503e-05, "loss": 0.7967, "num_input_tokens_seen": 17117360, "step": 29690 }, { "epoch": 4.422847780756628, "grad_norm": 0.22495460510253906, "learning_rate": 3.418121182039749e-05, "loss": 0.817, "num_input_tokens_seen": 17120464, "step": 29695 }, { "epoch": 4.423592493297587, "grad_norm": 0.18619653582572937, "learning_rate": 3.4175166787811004e-05, "loss": 0.8061, "num_input_tokens_seen": 17123312, "step": 29700 }, { "epoch": 4.424337205838547, "grad_norm": 0.27259138226509094, "learning_rate": 3.416912113520403e-05, "loss": 0.8441, "num_input_tokens_seen": 17126192, "step": 29705 }, { "epoch": 4.425081918379505, "grad_norm": 0.2550402581691742, "learning_rate": 3.416307486298513e-05, "loss": 0.8418, "num_input_tokens_seen": 17128848, "step": 29710 }, { "epoch": 4.425826630920465, "grad_norm": 0.2234872579574585, "learning_rate": 3.4157027971562897e-05, "loss": 0.8332, "num_input_tokens_seen": 17131792, "step": 29715 }, { "epoch": 4.426571343461424, "grad_norm": 0.21224026381969452, "learning_rate": 3.4150980461345945e-05, "loss": 0.8253, "num_input_tokens_seen": 17134416, "step": 29720 }, { "epoch": 4.427316056002383, "grad_norm": 0.28666621446609497, "learning_rate": 3.414493233274293e-05, "loss": 0.8054, "num_input_tokens_seen": 17137712, "step": 29725 }, { "epoch": 4.428060768543342, "grad_norm": 0.34683895111083984, "learning_rate": 3.413888358616256e-05, "loss": 0.7979, "num_input_tokens_seen": 17140528, "step": 29730 }, { "epoch": 4.428805481084302, "grad_norm": 0.290530264377594, "learning_rate": 3.413283422201361e-05, "loss": 0.8234, "num_input_tokens_seen": 17143568, "step": 29735 }, { "epoch": 4.429550193625261, "grad_norm": 0.21466079354286194, "learning_rate": 3.412678424070485e-05, "loss": 0.8134, "num_input_tokens_seen": 17146480, "step": 29740 }, { "epoch": 4.43029490616622, "grad_norm": 0.3216652274131775, "learning_rate": 3.4120733642645114e-05, "loss": 0.8076, "num_input_tokens_seen": 17149424, "step": 29745 }, { "epoch": 4.431039618707179, "grad_norm": 0.2911416292190552, "learning_rate": 3.411468242824328e-05, "loss": 0.8141, "num_input_tokens_seen": 17152336, "step": 29750 }, { "epoch": 4.431784331248139, "grad_norm": 0.19708621501922607, "learning_rate": 3.410863059790827e-05, "loss": 0.8029, "num_input_tokens_seen": 17155152, "step": 29755 }, { "epoch": 4.432529043789097, "grad_norm": 0.24347180128097534, "learning_rate": 3.4102578152049035e-05, "loss": 0.786, "num_input_tokens_seen": 17157936, "step": 29760 }, { "epoch": 4.433273756330056, "grad_norm": 0.24541954696178436, "learning_rate": 3.4096525091074585e-05, "loss": 0.8026, "num_input_tokens_seen": 17160880, "step": 29765 }, { "epoch": 4.434018468871016, "grad_norm": 0.19924503564834595, "learning_rate": 3.409047141539394e-05, "loss": 0.7997, "num_input_tokens_seen": 17163664, "step": 29770 }, { "epoch": 4.434763181411975, "grad_norm": 0.2186986804008484, "learning_rate": 3.40844171254162e-05, "loss": 0.8251, "num_input_tokens_seen": 17166640, "step": 29775 }, { "epoch": 4.435507893952934, "grad_norm": 0.17409609258174896, "learning_rate": 3.4078362221550485e-05, "loss": 0.8116, "num_input_tokens_seen": 17169712, "step": 29780 }, { "epoch": 4.436252606493893, "grad_norm": 0.24380239844322205, "learning_rate": 3.4072306704205966e-05, "loss": 0.7823, "num_input_tokens_seen": 17172752, "step": 29785 }, { "epoch": 4.436997319034853, "grad_norm": 0.149397611618042, "learning_rate": 3.4066250573791834e-05, "loss": 0.7928, "num_input_tokens_seen": 17175216, "step": 29790 }, { "epoch": 4.437742031575811, "grad_norm": 0.24380184710025787, "learning_rate": 3.4060193830717355e-05, "loss": 0.8111, "num_input_tokens_seen": 17177936, "step": 29795 }, { "epoch": 4.438486744116771, "grad_norm": 0.33515122532844543, "learning_rate": 3.405413647539182e-05, "loss": 0.8025, "num_input_tokens_seen": 17180784, "step": 29800 }, { "epoch": 4.43923145665773, "grad_norm": 0.1962922066450119, "learning_rate": 3.404807850822455e-05, "loss": 0.7754, "num_input_tokens_seen": 17184016, "step": 29805 }, { "epoch": 4.439976169198689, "grad_norm": 0.15638570487499237, "learning_rate": 3.4042019929624916e-05, "loss": 0.8497, "num_input_tokens_seen": 17186992, "step": 29810 }, { "epoch": 4.440720881739648, "grad_norm": 0.19075411558151245, "learning_rate": 3.403596074000234e-05, "loss": 0.7969, "num_input_tokens_seen": 17190096, "step": 29815 }, { "epoch": 4.441465594280608, "grad_norm": 0.18476583063602448, "learning_rate": 3.402990093976628e-05, "loss": 0.8323, "num_input_tokens_seen": 17193040, "step": 29820 }, { "epoch": 4.442210306821567, "grad_norm": 0.2158995419740677, "learning_rate": 3.402384052932622e-05, "loss": 0.7867, "num_input_tokens_seen": 17195856, "step": 29825 }, { "epoch": 4.442955019362526, "grad_norm": 0.22052329778671265, "learning_rate": 3.4017779509091705e-05, "loss": 0.8105, "num_input_tokens_seen": 17198800, "step": 29830 }, { "epoch": 4.443699731903485, "grad_norm": 0.18684203922748566, "learning_rate": 3.4011717879472315e-05, "loss": 0.7995, "num_input_tokens_seen": 17201776, "step": 29835 }, { "epoch": 4.444444444444445, "grad_norm": 0.2853511571884155, "learning_rate": 3.400565564087767e-05, "loss": 0.7972, "num_input_tokens_seen": 17204464, "step": 29840 }, { "epoch": 4.445189156985403, "grad_norm": 0.19592291116714478, "learning_rate": 3.399959279371743e-05, "loss": 0.8116, "num_input_tokens_seen": 17207280, "step": 29845 }, { "epoch": 4.445933869526363, "grad_norm": 0.18967176973819733, "learning_rate": 3.399352933840131e-05, "loss": 0.7642, "num_input_tokens_seen": 17210032, "step": 29850 }, { "epoch": 4.446678582067322, "grad_norm": 0.3136470913887024, "learning_rate": 3.3987465275339034e-05, "loss": 0.8081, "num_input_tokens_seen": 17212720, "step": 29855 }, { "epoch": 4.4474232946082815, "grad_norm": 0.18496648967266083, "learning_rate": 3.3981400604940393e-05, "loss": 0.8009, "num_input_tokens_seen": 17215344, "step": 29860 }, { "epoch": 4.44816800714924, "grad_norm": 0.25076645612716675, "learning_rate": 3.397533532761522e-05, "loss": 0.8333, "num_input_tokens_seen": 17218320, "step": 29865 }, { "epoch": 4.4489127196902, "grad_norm": 0.30023840069770813, "learning_rate": 3.3969269443773364e-05, "loss": 0.7806, "num_input_tokens_seen": 17221264, "step": 29870 }, { "epoch": 4.449657432231159, "grad_norm": 0.23720811307430267, "learning_rate": 3.396320295382476e-05, "loss": 0.8022, "num_input_tokens_seen": 17224304, "step": 29875 }, { "epoch": 4.450402144772118, "grad_norm": 0.27149590849876404, "learning_rate": 3.3957135858179335e-05, "loss": 0.8083, "num_input_tokens_seen": 17227088, "step": 29880 }, { "epoch": 4.451146857313077, "grad_norm": 0.19157588481903076, "learning_rate": 3.395106815724709e-05, "loss": 0.7763, "num_input_tokens_seen": 17229776, "step": 29885 }, { "epoch": 4.451891569854037, "grad_norm": 0.20071248710155487, "learning_rate": 3.3944999851438045e-05, "loss": 0.756, "num_input_tokens_seen": 17232752, "step": 29890 }, { "epoch": 4.452636282394995, "grad_norm": 0.23625044524669647, "learning_rate": 3.3938930941162285e-05, "loss": 0.8015, "num_input_tokens_seen": 17235856, "step": 29895 }, { "epoch": 4.453380994935955, "grad_norm": 0.22279630601406097, "learning_rate": 3.393286142682991e-05, "loss": 0.8049, "num_input_tokens_seen": 17238992, "step": 29900 }, { "epoch": 4.454125707476914, "grad_norm": 0.2832948565483093, "learning_rate": 3.392679130885108e-05, "loss": 0.7859, "num_input_tokens_seen": 17242064, "step": 29905 }, { "epoch": 4.4548704200178735, "grad_norm": 0.23611319065093994, "learning_rate": 3.392072058763598e-05, "loss": 0.8054, "num_input_tokens_seen": 17244816, "step": 29910 }, { "epoch": 4.455615132558832, "grad_norm": 0.25616931915283203, "learning_rate": 3.391464926359487e-05, "loss": 0.7961, "num_input_tokens_seen": 17247984, "step": 29915 }, { "epoch": 4.456359845099792, "grad_norm": 0.2790718078613281, "learning_rate": 3.390857733713799e-05, "loss": 0.8428, "num_input_tokens_seen": 17250992, "step": 29920 }, { "epoch": 4.457104557640751, "grad_norm": 0.19562330842018127, "learning_rate": 3.3902504808675684e-05, "loss": 0.802, "num_input_tokens_seen": 17254128, "step": 29925 }, { "epoch": 4.457849270181709, "grad_norm": 0.17488525807857513, "learning_rate": 3.389643167861829e-05, "loss": 0.7886, "num_input_tokens_seen": 17257200, "step": 29930 }, { "epoch": 4.458593982722669, "grad_norm": 0.2114408016204834, "learning_rate": 3.3890357947376216e-05, "loss": 0.7901, "num_input_tokens_seen": 17260112, "step": 29935 }, { "epoch": 4.459338695263629, "grad_norm": 0.43618226051330566, "learning_rate": 3.38842836153599e-05, "loss": 0.8382, "num_input_tokens_seen": 17263024, "step": 29940 }, { "epoch": 4.4600834078045875, "grad_norm": 0.18881921470165253, "learning_rate": 3.3878208682979815e-05, "loss": 0.7827, "num_input_tokens_seen": 17265712, "step": 29945 }, { "epoch": 4.460828120345546, "grad_norm": 0.34595972299575806, "learning_rate": 3.3872133150646484e-05, "loss": 0.8073, "num_input_tokens_seen": 17268688, "step": 29950 }, { "epoch": 4.461572832886506, "grad_norm": 0.21020013093948364, "learning_rate": 3.386605701877047e-05, "loss": 0.8082, "num_input_tokens_seen": 17271824, "step": 29955 }, { "epoch": 4.462317545427465, "grad_norm": 0.18601030111312866, "learning_rate": 3.3859980287762364e-05, "loss": 0.7908, "num_input_tokens_seen": 17274832, "step": 29960 }, { "epoch": 4.463062257968424, "grad_norm": 0.18622341752052307, "learning_rate": 3.385390295803281e-05, "loss": 0.817, "num_input_tokens_seen": 17277776, "step": 29965 }, { "epoch": 4.463806970509383, "grad_norm": 0.2685548663139343, "learning_rate": 3.3847825029992495e-05, "loss": 0.8223, "num_input_tokens_seen": 17280240, "step": 29970 }, { "epoch": 4.464551683050343, "grad_norm": 0.22744496166706085, "learning_rate": 3.384174650405213e-05, "loss": 0.8033, "num_input_tokens_seen": 17283184, "step": 29975 }, { "epoch": 4.465296395591301, "grad_norm": 0.2602878212928772, "learning_rate": 3.3835667380622497e-05, "loss": 0.847, "num_input_tokens_seen": 17285936, "step": 29980 }, { "epoch": 4.466041108132261, "grad_norm": 0.21350669860839844, "learning_rate": 3.382958766011439e-05, "loss": 0.7491, "num_input_tokens_seen": 17289008, "step": 29985 }, { "epoch": 4.46678582067322, "grad_norm": 0.22179266810417175, "learning_rate": 3.3823507342938634e-05, "loss": 0.794, "num_input_tokens_seen": 17292080, "step": 29990 }, { "epoch": 4.4675305332141795, "grad_norm": 0.25913217663764954, "learning_rate": 3.381742642950612e-05, "loss": 0.8151, "num_input_tokens_seen": 17295056, "step": 29995 }, { "epoch": 4.468275245755138, "grad_norm": 0.218468576669693, "learning_rate": 3.3811344920227795e-05, "loss": 0.7522, "num_input_tokens_seen": 17298224, "step": 30000 }, { "epoch": 4.469019958296098, "grad_norm": 0.2516282796859741, "learning_rate": 3.3805262815514596e-05, "loss": 0.8088, "num_input_tokens_seen": 17300976, "step": 30005 }, { "epoch": 4.469764670837057, "grad_norm": 0.20174792408943176, "learning_rate": 3.379918011577753e-05, "loss": 0.8273, "num_input_tokens_seen": 17304048, "step": 30010 }, { "epoch": 4.470509383378016, "grad_norm": 0.25291281938552856, "learning_rate": 3.379309682142766e-05, "loss": 0.7966, "num_input_tokens_seen": 17306896, "step": 30015 }, { "epoch": 4.471254095918975, "grad_norm": 0.18720223009586334, "learning_rate": 3.3787012932876036e-05, "loss": 0.786, "num_input_tokens_seen": 17309776, "step": 30020 }, { "epoch": 4.471998808459935, "grad_norm": 0.2593953013420105, "learning_rate": 3.378092845053382e-05, "loss": 0.8068, "num_input_tokens_seen": 17312784, "step": 30025 }, { "epoch": 4.4727435210008935, "grad_norm": 0.21408319473266602, "learning_rate": 3.377484337481216e-05, "loss": 0.8037, "num_input_tokens_seen": 17315440, "step": 30030 }, { "epoch": 4.473488233541853, "grad_norm": 0.2524590492248535, "learning_rate": 3.376875770612226e-05, "loss": 0.7873, "num_input_tokens_seen": 17318256, "step": 30035 }, { "epoch": 4.474232946082812, "grad_norm": 0.2516424357891083, "learning_rate": 3.376267144487535e-05, "loss": 0.8147, "num_input_tokens_seen": 17321168, "step": 30040 }, { "epoch": 4.4749776586237715, "grad_norm": 0.2008659392595291, "learning_rate": 3.375658459148275e-05, "loss": 0.7952, "num_input_tokens_seen": 17324112, "step": 30045 }, { "epoch": 4.47572237116473, "grad_norm": 0.22655944526195526, "learning_rate": 3.375049714635577e-05, "loss": 0.8431, "num_input_tokens_seen": 17327152, "step": 30050 }, { "epoch": 4.47646708370569, "grad_norm": 0.27342018485069275, "learning_rate": 3.374440910990574e-05, "loss": 0.7951, "num_input_tokens_seen": 17330224, "step": 30055 }, { "epoch": 4.477211796246649, "grad_norm": 0.26918137073516846, "learning_rate": 3.3738320482544116e-05, "loss": 0.7954, "num_input_tokens_seen": 17333104, "step": 30060 }, { "epoch": 4.477956508787608, "grad_norm": 0.2213192582130432, "learning_rate": 3.3732231264682326e-05, "loss": 0.8063, "num_input_tokens_seen": 17335824, "step": 30065 }, { "epoch": 4.478701221328567, "grad_norm": 0.2841854989528656, "learning_rate": 3.3726141456731835e-05, "loss": 0.8065, "num_input_tokens_seen": 17338544, "step": 30070 }, { "epoch": 4.479445933869527, "grad_norm": 0.29698607325553894, "learning_rate": 3.3720051059104186e-05, "loss": 0.8466, "num_input_tokens_seen": 17341488, "step": 30075 }, { "epoch": 4.4801906464104855, "grad_norm": 0.27569055557250977, "learning_rate": 3.371396007221094e-05, "loss": 0.7957, "num_input_tokens_seen": 17344688, "step": 30080 }, { "epoch": 4.480935358951445, "grad_norm": 0.21619053184986115, "learning_rate": 3.3707868496463705e-05, "loss": 0.8087, "num_input_tokens_seen": 17347472, "step": 30085 }, { "epoch": 4.481680071492404, "grad_norm": 0.2109423726797104, "learning_rate": 3.3701776332274116e-05, "loss": 0.7942, "num_input_tokens_seen": 17350448, "step": 30090 }, { "epoch": 4.4824247840333635, "grad_norm": 0.25547659397125244, "learning_rate": 3.3695683580053865e-05, "loss": 0.7962, "num_input_tokens_seen": 17353328, "step": 30095 }, { "epoch": 4.483169496574322, "grad_norm": 0.2688533365726471, "learning_rate": 3.368959024021467e-05, "loss": 0.8122, "num_input_tokens_seen": 17356112, "step": 30100 }, { "epoch": 4.483914209115282, "grad_norm": 0.19138197600841522, "learning_rate": 3.3683496313168294e-05, "loss": 0.7903, "num_input_tokens_seen": 17358896, "step": 30105 }, { "epoch": 4.484658921656241, "grad_norm": 0.20962932705879211, "learning_rate": 3.367740179932655e-05, "loss": 0.7921, "num_input_tokens_seen": 17361680, "step": 30110 }, { "epoch": 4.4854036341971995, "grad_norm": 0.2697218656539917, "learning_rate": 3.3671306699101266e-05, "loss": 0.8029, "num_input_tokens_seen": 17364432, "step": 30115 }, { "epoch": 4.486148346738159, "grad_norm": 0.14651785790920258, "learning_rate": 3.3665211012904324e-05, "loss": 0.8122, "num_input_tokens_seen": 17367152, "step": 30120 }, { "epoch": 4.486893059279118, "grad_norm": 0.22375549376010895, "learning_rate": 3.365911474114766e-05, "loss": 0.7742, "num_input_tokens_seen": 17370192, "step": 30125 }, { "epoch": 4.4876377718200775, "grad_norm": 0.1650152951478958, "learning_rate": 3.3653017884243224e-05, "loss": 0.7782, "num_input_tokens_seen": 17373136, "step": 30130 }, { "epoch": 4.488382484361036, "grad_norm": 0.14306508004665375, "learning_rate": 3.364692044260302e-05, "loss": 0.7886, "num_input_tokens_seen": 17375920, "step": 30135 }, { "epoch": 4.489127196901996, "grad_norm": 0.23851193487644196, "learning_rate": 3.3640822416639086e-05, "loss": 0.7793, "num_input_tokens_seen": 17378672, "step": 30140 }, { "epoch": 4.489871909442955, "grad_norm": 0.18391799926757812, "learning_rate": 3.363472380676351e-05, "loss": 0.8112, "num_input_tokens_seen": 17381552, "step": 30145 }, { "epoch": 4.490616621983914, "grad_norm": 0.17456525564193726, "learning_rate": 3.3628624613388407e-05, "loss": 0.7838, "num_input_tokens_seen": 17384400, "step": 30150 }, { "epoch": 4.491361334524873, "grad_norm": 0.18634715676307678, "learning_rate": 3.362252483692593e-05, "loss": 0.7808, "num_input_tokens_seen": 17387152, "step": 30155 }, { "epoch": 4.492106047065833, "grad_norm": 0.17908479273319244, "learning_rate": 3.361642447778828e-05, "loss": 0.7717, "num_input_tokens_seen": 17389680, "step": 30160 }, { "epoch": 4.4928507596067915, "grad_norm": 0.18879204988479614, "learning_rate": 3.36103235363877e-05, "loss": 0.7671, "num_input_tokens_seen": 17392304, "step": 30165 }, { "epoch": 4.493595472147751, "grad_norm": 0.29641586542129517, "learning_rate": 3.360422201313646e-05, "loss": 0.8342, "num_input_tokens_seen": 17395184, "step": 30170 }, { "epoch": 4.49434018468871, "grad_norm": 0.31804314255714417, "learning_rate": 3.3598119908446866e-05, "loss": 0.7863, "num_input_tokens_seen": 17398320, "step": 30175 }, { "epoch": 4.4950848972296695, "grad_norm": 0.27550065517425537, "learning_rate": 3.3592017222731304e-05, "loss": 0.8102, "num_input_tokens_seen": 17401520, "step": 30180 }, { "epoch": 4.495829609770628, "grad_norm": 0.33487364649772644, "learning_rate": 3.358591395640215e-05, "loss": 0.817, "num_input_tokens_seen": 17404496, "step": 30185 }, { "epoch": 4.496574322311588, "grad_norm": 0.33827993273735046, "learning_rate": 3.357981010987183e-05, "loss": 0.789, "num_input_tokens_seen": 17407728, "step": 30190 }, { "epoch": 4.497319034852547, "grad_norm": 0.285087525844574, "learning_rate": 3.3573705683552824e-05, "loss": 0.8192, "num_input_tokens_seen": 17410832, "step": 30195 }, { "epoch": 4.498063747393506, "grad_norm": 0.2567318081855774, "learning_rate": 3.356760067785765e-05, "loss": 0.8112, "num_input_tokens_seen": 17413552, "step": 30200 }, { "epoch": 4.498808459934465, "grad_norm": 0.1919706016778946, "learning_rate": 3.356149509319886e-05, "loss": 0.775, "num_input_tokens_seen": 17416464, "step": 30205 }, { "epoch": 4.499553172475425, "grad_norm": 0.2830159068107605, "learning_rate": 3.355538892998904e-05, "loss": 0.7908, "num_input_tokens_seen": 17419088, "step": 30210 }, { "epoch": 4.5, "eval_loss": 0.8038442730903625, "eval_runtime": 45.3524, "eval_samples_per_second": 65.796, "eval_steps_per_second": 16.449, "num_input_tokens_seen": 17420720, "step": 30213 }, { "epoch": 4.5002978850163835, "grad_norm": 0.3130204975605011, "learning_rate": 3.3549282188640815e-05, "loss": 0.7699, "num_input_tokens_seen": 17421968, "step": 30215 }, { "epoch": 4.501042597557343, "grad_norm": 0.18921497464179993, "learning_rate": 3.354317486956685e-05, "loss": 0.776, "num_input_tokens_seen": 17424752, "step": 30220 }, { "epoch": 4.501787310098302, "grad_norm": 0.22012145817279816, "learning_rate": 3.353706697317988e-05, "loss": 0.7912, "num_input_tokens_seen": 17427728, "step": 30225 }, { "epoch": 4.5025320226392616, "grad_norm": 0.30191314220428467, "learning_rate": 3.353095849989262e-05, "loss": 0.8076, "num_input_tokens_seen": 17430800, "step": 30230 }, { "epoch": 4.50327673518022, "grad_norm": 0.21480250358581543, "learning_rate": 3.352484945011787e-05, "loss": 0.813, "num_input_tokens_seen": 17433776, "step": 30235 }, { "epoch": 4.50402144772118, "grad_norm": 0.2274184674024582, "learning_rate": 3.351873982426846e-05, "loss": 0.7968, "num_input_tokens_seen": 17436752, "step": 30240 }, { "epoch": 4.504766160262139, "grad_norm": 0.3357594609260559, "learning_rate": 3.3512629622757245e-05, "loss": 0.7859, "num_input_tokens_seen": 17439952, "step": 30245 }, { "epoch": 4.505510872803098, "grad_norm": 0.20225432515144348, "learning_rate": 3.350651884599713e-05, "loss": 0.7761, "num_input_tokens_seen": 17442768, "step": 30250 }, { "epoch": 4.506255585344057, "grad_norm": 0.2260321080684662, "learning_rate": 3.350040749440105e-05, "loss": 0.8244, "num_input_tokens_seen": 17445712, "step": 30255 }, { "epoch": 4.507000297885017, "grad_norm": 0.174652561545372, "learning_rate": 3.3494295568382006e-05, "loss": 0.7799, "num_input_tokens_seen": 17448816, "step": 30260 }, { "epoch": 4.5077450104259755, "grad_norm": 0.20659463107585907, "learning_rate": 3.348818306835299e-05, "loss": 0.8373, "num_input_tokens_seen": 17451728, "step": 30265 }, { "epoch": 4.508489722966935, "grad_norm": 0.3028360605239868, "learning_rate": 3.348206999472708e-05, "loss": 0.7859, "num_input_tokens_seen": 17454832, "step": 30270 }, { "epoch": 4.509234435507894, "grad_norm": 0.3487398624420166, "learning_rate": 3.3475956347917356e-05, "loss": 0.7798, "num_input_tokens_seen": 17457744, "step": 30275 }, { "epoch": 4.509979148048853, "grad_norm": 0.233636736869812, "learning_rate": 3.346984212833697e-05, "loss": 0.7864, "num_input_tokens_seen": 17460624, "step": 30280 }, { "epoch": 4.510723860589812, "grad_norm": 0.32891491055488586, "learning_rate": 3.346372733639909e-05, "loss": 0.7987, "num_input_tokens_seen": 17463792, "step": 30285 }, { "epoch": 4.511468573130772, "grad_norm": 0.20848938822746277, "learning_rate": 3.345761197251692e-05, "loss": 0.7891, "num_input_tokens_seen": 17466480, "step": 30290 }, { "epoch": 4.512213285671731, "grad_norm": 0.2028023898601532, "learning_rate": 3.345149603710373e-05, "loss": 0.7528, "num_input_tokens_seen": 17469040, "step": 30295 }, { "epoch": 4.5129579982126895, "grad_norm": 0.2718329429626465, "learning_rate": 3.344537953057279e-05, "loss": 0.7856, "num_input_tokens_seen": 17471920, "step": 30300 }, { "epoch": 4.513702710753649, "grad_norm": 0.17836907505989075, "learning_rate": 3.343926245333745e-05, "loss": 0.8389, "num_input_tokens_seen": 17474800, "step": 30305 }, { "epoch": 4.514447423294608, "grad_norm": 0.23405256867408752, "learning_rate": 3.343314480581104e-05, "loss": 0.7944, "num_input_tokens_seen": 17477616, "step": 30310 }, { "epoch": 4.5151921358355676, "grad_norm": 0.19844752550125122, "learning_rate": 3.342702658840702e-05, "loss": 0.7779, "num_input_tokens_seen": 17480208, "step": 30315 }, { "epoch": 4.515936848376526, "grad_norm": 0.28314706683158875, "learning_rate": 3.3420907801538784e-05, "loss": 0.8124, "num_input_tokens_seen": 17483056, "step": 30320 }, { "epoch": 4.516681560917486, "grad_norm": 0.2655097544193268, "learning_rate": 3.3414788445619844e-05, "loss": 0.7945, "num_input_tokens_seen": 17486032, "step": 30325 }, { "epoch": 4.517426273458445, "grad_norm": 0.2701096534729004, "learning_rate": 3.340866852106371e-05, "loss": 0.8551, "num_input_tokens_seen": 17488880, "step": 30330 }, { "epoch": 4.518170985999404, "grad_norm": 0.2149180769920349, "learning_rate": 3.340254802828395e-05, "loss": 0.8012, "num_input_tokens_seen": 17491600, "step": 30335 }, { "epoch": 4.518915698540363, "grad_norm": 0.31005746126174927, "learning_rate": 3.339642696769415e-05, "loss": 0.8004, "num_input_tokens_seen": 17494576, "step": 30340 }, { "epoch": 4.519660411081323, "grad_norm": 0.29260221123695374, "learning_rate": 3.339030533970796e-05, "loss": 0.8237, "num_input_tokens_seen": 17497456, "step": 30345 }, { "epoch": 4.5204051236222815, "grad_norm": 0.26699140667915344, "learning_rate": 3.338418314473904e-05, "loss": 0.7875, "num_input_tokens_seen": 17500592, "step": 30350 }, { "epoch": 4.521149836163241, "grad_norm": 0.23911145329475403, "learning_rate": 3.3378060383201116e-05, "loss": 0.7888, "num_input_tokens_seen": 17503440, "step": 30355 }, { "epoch": 4.5218945487042, "grad_norm": 0.2672087252140045, "learning_rate": 3.337193705550793e-05, "loss": 0.807, "num_input_tokens_seen": 17506544, "step": 30360 }, { "epoch": 4.52263926124516, "grad_norm": 0.25512176752090454, "learning_rate": 3.3365813162073284e-05, "loss": 0.8282, "num_input_tokens_seen": 17509680, "step": 30365 }, { "epoch": 4.523383973786118, "grad_norm": 0.22440069913864136, "learning_rate": 3.3359688703310984e-05, "loss": 0.7828, "num_input_tokens_seen": 17512848, "step": 30370 }, { "epoch": 4.524128686327078, "grad_norm": 0.24436435103416443, "learning_rate": 3.335356367963492e-05, "loss": 0.7982, "num_input_tokens_seen": 17515696, "step": 30375 }, { "epoch": 4.524873398868037, "grad_norm": 0.18934062123298645, "learning_rate": 3.334743809145898e-05, "loss": 0.803, "num_input_tokens_seen": 17518352, "step": 30380 }, { "epoch": 4.525618111408996, "grad_norm": 0.20951126515865326, "learning_rate": 3.334131193919712e-05, "loss": 0.8218, "num_input_tokens_seen": 17521072, "step": 30385 }, { "epoch": 4.526362823949955, "grad_norm": 0.18793098628520966, "learning_rate": 3.333518522326331e-05, "loss": 0.8131, "num_input_tokens_seen": 17523824, "step": 30390 }, { "epoch": 4.527107536490915, "grad_norm": 0.22632981836795807, "learning_rate": 3.3329057944071564e-05, "loss": 0.7953, "num_input_tokens_seen": 17527120, "step": 30395 }, { "epoch": 4.5278522490318736, "grad_norm": 0.34653186798095703, "learning_rate": 3.332293010203595e-05, "loss": 0.796, "num_input_tokens_seen": 17530224, "step": 30400 }, { "epoch": 4.528596961572833, "grad_norm": 0.2245648205280304, "learning_rate": 3.331680169757056e-05, "loss": 0.806, "num_input_tokens_seen": 17532944, "step": 30405 }, { "epoch": 4.529341674113792, "grad_norm": 0.21108295023441315, "learning_rate": 3.331067273108952e-05, "loss": 0.8082, "num_input_tokens_seen": 17535760, "step": 30410 }, { "epoch": 4.530086386654752, "grad_norm": 0.32359257340431213, "learning_rate": 3.330454320300701e-05, "loss": 0.8226, "num_input_tokens_seen": 17538608, "step": 30415 }, { "epoch": 4.53083109919571, "grad_norm": 0.29993945360183716, "learning_rate": 3.329841311373723e-05, "loss": 0.7769, "num_input_tokens_seen": 17541360, "step": 30420 }, { "epoch": 4.53157581173667, "grad_norm": 0.2827056050300598, "learning_rate": 3.3292282463694435e-05, "loss": 0.771, "num_input_tokens_seen": 17544112, "step": 30425 }, { "epoch": 4.532320524277629, "grad_norm": 0.2249263972043991, "learning_rate": 3.328615125329291e-05, "loss": 0.7749, "num_input_tokens_seen": 17547344, "step": 30430 }, { "epoch": 4.533065236818588, "grad_norm": 0.2024345099925995, "learning_rate": 3.328001948294695e-05, "loss": 0.8024, "num_input_tokens_seen": 17550096, "step": 30435 }, { "epoch": 4.533809949359547, "grad_norm": 0.2274583876132965, "learning_rate": 3.327388715307096e-05, "loss": 0.8241, "num_input_tokens_seen": 17553136, "step": 30440 }, { "epoch": 4.534554661900506, "grad_norm": 0.2661801874637604, "learning_rate": 3.3267754264079314e-05, "loss": 0.7864, "num_input_tokens_seen": 17556016, "step": 30445 }, { "epoch": 4.535299374441466, "grad_norm": 0.28200092911720276, "learning_rate": 3.326162081638644e-05, "loss": 0.8247, "num_input_tokens_seen": 17559056, "step": 30450 }, { "epoch": 4.536044086982425, "grad_norm": 0.280965119600296, "learning_rate": 3.3255486810406825e-05, "loss": 0.7896, "num_input_tokens_seen": 17561712, "step": 30455 }, { "epoch": 4.536788799523384, "grad_norm": 0.254401296377182, "learning_rate": 3.324935224655497e-05, "loss": 0.8135, "num_input_tokens_seen": 17564656, "step": 30460 }, { "epoch": 4.537533512064343, "grad_norm": 0.23179909586906433, "learning_rate": 3.324321712524544e-05, "loss": 0.841, "num_input_tokens_seen": 17567920, "step": 30465 }, { "epoch": 4.538278224605302, "grad_norm": 0.19891537725925446, "learning_rate": 3.32370814468928e-05, "loss": 0.7965, "num_input_tokens_seen": 17570928, "step": 30470 }, { "epoch": 4.539022937146262, "grad_norm": 0.32729125022888184, "learning_rate": 3.323094521191169e-05, "loss": 0.845, "num_input_tokens_seen": 17573840, "step": 30475 }, { "epoch": 4.539767649687221, "grad_norm": 0.21205902099609375, "learning_rate": 3.322480842071677e-05, "loss": 0.7951, "num_input_tokens_seen": 17576688, "step": 30480 }, { "epoch": 4.5405123622281796, "grad_norm": 0.19530712068080902, "learning_rate": 3.321867107372274e-05, "loss": 0.8205, "num_input_tokens_seen": 17579568, "step": 30485 }, { "epoch": 4.541257074769139, "grad_norm": 0.1515970677137375, "learning_rate": 3.321253317134432e-05, "loss": 0.7751, "num_input_tokens_seen": 17582384, "step": 30490 }, { "epoch": 4.542001787310098, "grad_norm": 0.22368891537189484, "learning_rate": 3.320639471399631e-05, "loss": 0.7778, "num_input_tokens_seen": 17585328, "step": 30495 }, { "epoch": 4.542746499851058, "grad_norm": 0.2857806086540222, "learning_rate": 3.3200255702093506e-05, "loss": 0.8157, "num_input_tokens_seen": 17588112, "step": 30500 }, { "epoch": 4.543491212392016, "grad_norm": 0.2464553415775299, "learning_rate": 3.319411613605076e-05, "loss": 0.8047, "num_input_tokens_seen": 17590928, "step": 30505 }, { "epoch": 4.544235924932976, "grad_norm": 0.29080355167388916, "learning_rate": 3.3187976016282964e-05, "loss": 0.8154, "num_input_tokens_seen": 17593904, "step": 30510 }, { "epoch": 4.544980637473935, "grad_norm": 0.26645028591156006, "learning_rate": 3.3181835343205035e-05, "loss": 0.7972, "num_input_tokens_seen": 17596752, "step": 30515 }, { "epoch": 4.545725350014894, "grad_norm": 0.25474196672439575, "learning_rate": 3.317569411723194e-05, "loss": 0.8422, "num_input_tokens_seen": 17599472, "step": 30520 }, { "epoch": 4.546470062555853, "grad_norm": 0.21583931148052216, "learning_rate": 3.316955233877868e-05, "loss": 0.7683, "num_input_tokens_seen": 17602320, "step": 30525 }, { "epoch": 4.547214775096813, "grad_norm": 0.23735864460468292, "learning_rate": 3.316341000826029e-05, "loss": 0.821, "num_input_tokens_seen": 17605328, "step": 30530 }, { "epoch": 4.547959487637772, "grad_norm": 0.21836154162883759, "learning_rate": 3.315726712609183e-05, "loss": 0.8024, "num_input_tokens_seen": 17608240, "step": 30535 }, { "epoch": 4.548704200178731, "grad_norm": 0.2438284158706665, "learning_rate": 3.3151123692688424e-05, "loss": 0.7739, "num_input_tokens_seen": 17610992, "step": 30540 }, { "epoch": 4.54944891271969, "grad_norm": 0.2107919305562973, "learning_rate": 3.3144979708465226e-05, "loss": 0.8103, "num_input_tokens_seen": 17613712, "step": 30545 }, { "epoch": 4.55019362526065, "grad_norm": 0.23681923747062683, "learning_rate": 3.313883517383741e-05, "loss": 0.7892, "num_input_tokens_seen": 17616592, "step": 30550 }, { "epoch": 4.550938337801608, "grad_norm": 0.2716189920902252, "learning_rate": 3.313269008922021e-05, "loss": 0.8237, "num_input_tokens_seen": 17619728, "step": 30555 }, { "epoch": 4.551683050342568, "grad_norm": 0.22388148307800293, "learning_rate": 3.312654445502887e-05, "loss": 0.7662, "num_input_tokens_seen": 17622576, "step": 30560 }, { "epoch": 4.552427762883527, "grad_norm": 0.20997504889965057, "learning_rate": 3.3120398271678706e-05, "loss": 0.8182, "num_input_tokens_seen": 17625360, "step": 30565 }, { "epoch": 4.553172475424486, "grad_norm": 0.20649762451648712, "learning_rate": 3.311425153958503e-05, "loss": 0.8202, "num_input_tokens_seen": 17628368, "step": 30570 }, { "epoch": 4.553917187965445, "grad_norm": 0.29029810428619385, "learning_rate": 3.310810425916323e-05, "loss": 0.7809, "num_input_tokens_seen": 17631312, "step": 30575 }, { "epoch": 4.554661900506405, "grad_norm": 0.19472402334213257, "learning_rate": 3.3101956430828715e-05, "loss": 0.7913, "num_input_tokens_seen": 17634160, "step": 30580 }, { "epoch": 4.555406613047364, "grad_norm": 0.2344467043876648, "learning_rate": 3.309580805499692e-05, "loss": 0.8015, "num_input_tokens_seen": 17637168, "step": 30585 }, { "epoch": 4.556151325588323, "grad_norm": 0.1957375705242157, "learning_rate": 3.3089659132083327e-05, "loss": 0.7791, "num_input_tokens_seen": 17640144, "step": 30590 }, { "epoch": 4.556896038129282, "grad_norm": 0.3535217046737671, "learning_rate": 3.3083509662503466e-05, "loss": 0.8048, "num_input_tokens_seen": 17643088, "step": 30595 }, { "epoch": 4.557640750670242, "grad_norm": 0.27980974316596985, "learning_rate": 3.3077359646672884e-05, "loss": 0.7612, "num_input_tokens_seen": 17645904, "step": 30600 }, { "epoch": 4.5583854632112, "grad_norm": 0.3688909709453583, "learning_rate": 3.307120908500718e-05, "loss": 0.8437, "num_input_tokens_seen": 17649040, "step": 30605 }, { "epoch": 4.559130175752159, "grad_norm": 0.23035761713981628, "learning_rate": 3.3065057977921986e-05, "loss": 0.8089, "num_input_tokens_seen": 17652240, "step": 30610 }, { "epoch": 4.559874888293119, "grad_norm": 0.18699049949645996, "learning_rate": 3.305890632583295e-05, "loss": 0.7776, "num_input_tokens_seen": 17654800, "step": 30615 }, { "epoch": 4.5606196008340785, "grad_norm": 0.2681196331977844, "learning_rate": 3.30527541291558e-05, "loss": 0.7984, "num_input_tokens_seen": 17657744, "step": 30620 }, { "epoch": 4.561364313375037, "grad_norm": 0.2660687565803528, "learning_rate": 3.304660138830626e-05, "loss": 0.7844, "num_input_tokens_seen": 17660784, "step": 30625 }, { "epoch": 4.562109025915996, "grad_norm": 0.23370856046676636, "learning_rate": 3.3040448103700124e-05, "loss": 0.8055, "num_input_tokens_seen": 17663504, "step": 30630 }, { "epoch": 4.562853738456956, "grad_norm": 0.26302242279052734, "learning_rate": 3.303429427575319e-05, "loss": 0.8238, "num_input_tokens_seen": 17666128, "step": 30635 }, { "epoch": 4.563598450997915, "grad_norm": 0.3011392056941986, "learning_rate": 3.30281399048813e-05, "loss": 0.8059, "num_input_tokens_seen": 17669296, "step": 30640 }, { "epoch": 4.564343163538874, "grad_norm": 0.19048738479614258, "learning_rate": 3.302198499150038e-05, "loss": 0.811, "num_input_tokens_seen": 17672144, "step": 30645 }, { "epoch": 4.565087876079833, "grad_norm": 0.19466781616210938, "learning_rate": 3.301582953602631e-05, "loss": 0.8318, "num_input_tokens_seen": 17674864, "step": 30650 }, { "epoch": 4.565832588620792, "grad_norm": 0.1801174134016037, "learning_rate": 3.300967353887507e-05, "loss": 0.8005, "num_input_tokens_seen": 17677712, "step": 30655 }, { "epoch": 4.566577301161751, "grad_norm": 0.19941450655460358, "learning_rate": 3.300351700046267e-05, "loss": 0.7726, "num_input_tokens_seen": 17680176, "step": 30660 }, { "epoch": 4.567322013702711, "grad_norm": 0.3014239966869354, "learning_rate": 3.299735992120513e-05, "loss": 0.839, "num_input_tokens_seen": 17683088, "step": 30665 }, { "epoch": 4.56806672624367, "grad_norm": 0.2685741186141968, "learning_rate": 3.299120230151852e-05, "loss": 0.8411, "num_input_tokens_seen": 17686256, "step": 30670 }, { "epoch": 4.568811438784629, "grad_norm": 0.2251213937997818, "learning_rate": 3.298504414181894e-05, "loss": 0.8007, "num_input_tokens_seen": 17688976, "step": 30675 }, { "epoch": 4.569556151325588, "grad_norm": 0.2333323210477829, "learning_rate": 3.297888544252255e-05, "loss": 0.7868, "num_input_tokens_seen": 17691728, "step": 30680 }, { "epoch": 4.570300863866548, "grad_norm": 0.2475966215133667, "learning_rate": 3.2972726204045515e-05, "loss": 0.7935, "num_input_tokens_seen": 17694224, "step": 30685 }, { "epoch": 4.571045576407506, "grad_norm": 0.1768454909324646, "learning_rate": 3.2966566426804057e-05, "loss": 0.8268, "num_input_tokens_seen": 17696944, "step": 30690 }, { "epoch": 4.571790288948466, "grad_norm": 0.25204798579216003, "learning_rate": 3.296040611121444e-05, "loss": 0.8068, "num_input_tokens_seen": 17699632, "step": 30695 }, { "epoch": 4.572535001489425, "grad_norm": 0.28106406331062317, "learning_rate": 3.295424525769293e-05, "loss": 0.8039, "num_input_tokens_seen": 17702384, "step": 30700 }, { "epoch": 4.5732797140303845, "grad_norm": 0.23427164554595947, "learning_rate": 3.2948083866655865e-05, "loss": 0.8013, "num_input_tokens_seen": 17705008, "step": 30705 }, { "epoch": 4.574024426571343, "grad_norm": 0.32512879371643066, "learning_rate": 3.294192193851963e-05, "loss": 0.8024, "num_input_tokens_seen": 17707952, "step": 30710 }, { "epoch": 4.574769139112303, "grad_norm": 0.22756049036979675, "learning_rate": 3.293575947370057e-05, "loss": 0.7967, "num_input_tokens_seen": 17710768, "step": 30715 }, { "epoch": 4.575513851653262, "grad_norm": 0.22264206409454346, "learning_rate": 3.2929596472615165e-05, "loss": 0.7966, "num_input_tokens_seen": 17713456, "step": 30720 }, { "epoch": 4.576258564194221, "grad_norm": 0.23481236398220062, "learning_rate": 3.292343293567986e-05, "loss": 0.8331, "num_input_tokens_seen": 17716208, "step": 30725 }, { "epoch": 4.57700327673518, "grad_norm": 0.2068309336900711, "learning_rate": 3.291726886331119e-05, "loss": 0.8119, "num_input_tokens_seen": 17719312, "step": 30730 }, { "epoch": 4.57774798927614, "grad_norm": 0.22240158915519714, "learning_rate": 3.291110425592566e-05, "loss": 0.8071, "num_input_tokens_seen": 17722160, "step": 30735 }, { "epoch": 4.578492701817098, "grad_norm": 0.21021263301372528, "learning_rate": 3.290493911393988e-05, "loss": 0.7911, "num_input_tokens_seen": 17725008, "step": 30740 }, { "epoch": 4.579237414358058, "grad_norm": 0.16710242629051208, "learning_rate": 3.289877343777045e-05, "loss": 0.7954, "num_input_tokens_seen": 17727824, "step": 30745 }, { "epoch": 4.579982126899017, "grad_norm": 0.23741017282009125, "learning_rate": 3.2892607227834024e-05, "loss": 0.8178, "num_input_tokens_seen": 17731312, "step": 30750 }, { "epoch": 4.5807268394399765, "grad_norm": 0.2347629964351654, "learning_rate": 3.288644048454729e-05, "loss": 0.8164, "num_input_tokens_seen": 17734032, "step": 30755 }, { "epoch": 4.581471551980935, "grad_norm": 0.2195577472448349, "learning_rate": 3.288027320832698e-05, "loss": 0.7806, "num_input_tokens_seen": 17736848, "step": 30760 }, { "epoch": 4.582216264521895, "grad_norm": 0.26777321100234985, "learning_rate": 3.287410539958984e-05, "loss": 0.8403, "num_input_tokens_seen": 17739792, "step": 30765 }, { "epoch": 4.582960977062854, "grad_norm": 0.20178630948066711, "learning_rate": 3.286793705875267e-05, "loss": 0.8356, "num_input_tokens_seen": 17742480, "step": 30770 }, { "epoch": 4.583705689603813, "grad_norm": 0.2375384420156479, "learning_rate": 3.2861768186232306e-05, "loss": 0.8205, "num_input_tokens_seen": 17745328, "step": 30775 }, { "epoch": 4.584450402144772, "grad_norm": 0.15990939736366272, "learning_rate": 3.2855598782445606e-05, "loss": 0.7889, "num_input_tokens_seen": 17748240, "step": 30780 }, { "epoch": 4.585195114685732, "grad_norm": 0.28940409421920776, "learning_rate": 3.2849428847809474e-05, "loss": 0.8039, "num_input_tokens_seen": 17750928, "step": 30785 }, { "epoch": 4.5859398272266905, "grad_norm": 0.22245104610919952, "learning_rate": 3.2843258382740866e-05, "loss": 0.8195, "num_input_tokens_seen": 17753840, "step": 30790 }, { "epoch": 4.586684539767649, "grad_norm": 0.27487727999687195, "learning_rate": 3.283708738765674e-05, "loss": 0.8149, "num_input_tokens_seen": 17756496, "step": 30795 }, { "epoch": 4.587429252308609, "grad_norm": 0.22900331020355225, "learning_rate": 3.283091586297411e-05, "loss": 0.7774, "num_input_tokens_seen": 17759184, "step": 30800 }, { "epoch": 4.5881739648495685, "grad_norm": 0.1622133105993271, "learning_rate": 3.2824743809110024e-05, "loss": 0.8138, "num_input_tokens_seen": 17761936, "step": 30805 }, { "epoch": 4.588918677390527, "grad_norm": 0.2727031707763672, "learning_rate": 3.281857122648157e-05, "loss": 0.8093, "num_input_tokens_seen": 17764880, "step": 30810 }, { "epoch": 4.589663389931486, "grad_norm": 0.26154911518096924, "learning_rate": 3.281239811550586e-05, "loss": 0.7901, "num_input_tokens_seen": 17767536, "step": 30815 }, { "epoch": 4.590408102472446, "grad_norm": 0.21219377219676971, "learning_rate": 3.280622447660004e-05, "loss": 0.8151, "num_input_tokens_seen": 17770384, "step": 30820 }, { "epoch": 4.591152815013404, "grad_norm": 0.20690348744392395, "learning_rate": 3.280005031018131e-05, "loss": 0.8058, "num_input_tokens_seen": 17773296, "step": 30825 }, { "epoch": 4.591897527554364, "grad_norm": 0.27650925517082214, "learning_rate": 3.2793875616666904e-05, "loss": 0.7973, "num_input_tokens_seen": 17775984, "step": 30830 }, { "epoch": 4.592642240095323, "grad_norm": 0.23348627984523773, "learning_rate": 3.278770039647406e-05, "loss": 0.8105, "num_input_tokens_seen": 17778736, "step": 30835 }, { "epoch": 4.5933869526362825, "grad_norm": 0.2236170768737793, "learning_rate": 3.278152465002008e-05, "loss": 0.8094, "num_input_tokens_seen": 17781584, "step": 30840 }, { "epoch": 4.594131665177241, "grad_norm": 0.258318692445755, "learning_rate": 3.277534837772232e-05, "loss": 0.8151, "num_input_tokens_seen": 17784304, "step": 30845 }, { "epoch": 4.594876377718201, "grad_norm": 0.2618939280509949, "learning_rate": 3.276917157999811e-05, "loss": 0.8093, "num_input_tokens_seen": 17787376, "step": 30850 }, { "epoch": 4.59562109025916, "grad_norm": 0.20110449194908142, "learning_rate": 3.276299425726489e-05, "loss": 0.7868, "num_input_tokens_seen": 17790128, "step": 30855 }, { "epoch": 4.596365802800119, "grad_norm": 0.23206137120723724, "learning_rate": 3.275681640994007e-05, "loss": 0.7914, "num_input_tokens_seen": 17792848, "step": 30860 }, { "epoch": 4.597110515341078, "grad_norm": 0.3146090507507324, "learning_rate": 3.275063803844113e-05, "loss": 0.7916, "num_input_tokens_seen": 17795600, "step": 30865 }, { "epoch": 4.597855227882038, "grad_norm": 0.17212162911891937, "learning_rate": 3.274445914318559e-05, "loss": 0.8038, "num_input_tokens_seen": 17798096, "step": 30870 }, { "epoch": 4.5985999404229965, "grad_norm": 0.20890890061855316, "learning_rate": 3.273827972459099e-05, "loss": 0.7753, "num_input_tokens_seen": 17801072, "step": 30875 }, { "epoch": 4.599344652963956, "grad_norm": 0.24938131868839264, "learning_rate": 3.27320997830749e-05, "loss": 0.8148, "num_input_tokens_seen": 17804016, "step": 30880 }, { "epoch": 4.600089365504915, "grad_norm": 0.2520475387573242, "learning_rate": 3.2725919319054946e-05, "loss": 0.8102, "num_input_tokens_seen": 17806672, "step": 30885 }, { "epoch": 4.6008340780458745, "grad_norm": 0.21360035240650177, "learning_rate": 3.271973833294877e-05, "loss": 0.8053, "num_input_tokens_seen": 17809840, "step": 30890 }, { "epoch": 4.601578790586833, "grad_norm": 0.2671970725059509, "learning_rate": 3.2713556825174074e-05, "loss": 0.7833, "num_input_tokens_seen": 17812816, "step": 30895 }, { "epoch": 4.602323503127793, "grad_norm": 0.1886308789253235, "learning_rate": 3.270737479614856e-05, "loss": 0.7903, "num_input_tokens_seen": 17815664, "step": 30900 }, { "epoch": 4.603068215668752, "grad_norm": 0.19590915739536285, "learning_rate": 3.270119224629e-05, "loss": 0.7939, "num_input_tokens_seen": 17818224, "step": 30905 }, { "epoch": 4.603812928209711, "grad_norm": 0.2949199080467224, "learning_rate": 3.269500917601618e-05, "loss": 0.7816, "num_input_tokens_seen": 17821008, "step": 30910 }, { "epoch": 4.60455764075067, "grad_norm": 0.23234862089157104, "learning_rate": 3.268882558574492e-05, "loss": 0.8332, "num_input_tokens_seen": 17823792, "step": 30915 }, { "epoch": 4.60530235329163, "grad_norm": 0.23501385748386383, "learning_rate": 3.268264147589409e-05, "loss": 0.7962, "num_input_tokens_seen": 17826704, "step": 30920 }, { "epoch": 4.6060470658325885, "grad_norm": 0.27966243028640747, "learning_rate": 3.2676456846881583e-05, "loss": 0.8167, "num_input_tokens_seen": 17829936, "step": 30925 }, { "epoch": 4.606791778373548, "grad_norm": 0.35109513998031616, "learning_rate": 3.267027169912533e-05, "loss": 0.822, "num_input_tokens_seen": 17832976, "step": 30930 }, { "epoch": 4.607536490914507, "grad_norm": 0.21549750864505768, "learning_rate": 3.2664086033043304e-05, "loss": 0.8377, "num_input_tokens_seen": 17835952, "step": 30935 }, { "epoch": 4.6082812034554665, "grad_norm": 0.24945758283138275, "learning_rate": 3.265789984905351e-05, "loss": 0.7927, "num_input_tokens_seen": 17839056, "step": 30940 }, { "epoch": 4.609025915996425, "grad_norm": 0.16810640692710876, "learning_rate": 3.265171314757397e-05, "loss": 0.8201, "num_input_tokens_seen": 17841968, "step": 30945 }, { "epoch": 4.609770628537385, "grad_norm": 0.24803940951824188, "learning_rate": 3.264552592902277e-05, "loss": 0.7818, "num_input_tokens_seen": 17844912, "step": 30950 }, { "epoch": 4.610515341078344, "grad_norm": 0.23509083688259125, "learning_rate": 3.2639338193818006e-05, "loss": 0.8011, "num_input_tokens_seen": 17847888, "step": 30955 }, { "epoch": 4.6112600536193025, "grad_norm": 0.22614796459674835, "learning_rate": 3.2633149942377834e-05, "loss": 0.7936, "num_input_tokens_seen": 17850960, "step": 30960 }, { "epoch": 4.612004766160262, "grad_norm": 0.27556905150413513, "learning_rate": 3.2626961175120414e-05, "loss": 0.8116, "num_input_tokens_seen": 17853872, "step": 30965 }, { "epoch": 4.612749478701222, "grad_norm": 0.2770381569862366, "learning_rate": 3.262077189246398e-05, "loss": 0.8094, "num_input_tokens_seen": 17856816, "step": 30970 }, { "epoch": 4.6134941912421805, "grad_norm": 0.3215067386627197, "learning_rate": 3.261458209482675e-05, "loss": 0.8084, "num_input_tokens_seen": 17859984, "step": 30975 }, { "epoch": 4.614238903783139, "grad_norm": 0.19808711111545563, "learning_rate": 3.260839178262703e-05, "loss": 0.8114, "num_input_tokens_seen": 17862640, "step": 30980 }, { "epoch": 4.614983616324099, "grad_norm": 0.20726098120212555, "learning_rate": 3.260220095628312e-05, "loss": 0.796, "num_input_tokens_seen": 17865456, "step": 30985 }, { "epoch": 4.615728328865059, "grad_norm": 0.19070424139499664, "learning_rate": 3.259600961621339e-05, "loss": 0.7948, "num_input_tokens_seen": 17868048, "step": 30990 }, { "epoch": 4.616473041406017, "grad_norm": 0.238511323928833, "learning_rate": 3.25898177628362e-05, "loss": 0.8278, "num_input_tokens_seen": 17870960, "step": 30995 }, { "epoch": 4.617217753946976, "grad_norm": 0.20710867643356323, "learning_rate": 3.258362539656999e-05, "loss": 0.8114, "num_input_tokens_seen": 17874032, "step": 31000 }, { "epoch": 4.617962466487936, "grad_norm": 0.17913572490215302, "learning_rate": 3.2577432517833204e-05, "loss": 0.7789, "num_input_tokens_seen": 17876944, "step": 31005 }, { "epoch": 4.6187071790288945, "grad_norm": 0.19244541227817535, "learning_rate": 3.257123912704435e-05, "loss": 0.7807, "num_input_tokens_seen": 17879728, "step": 31010 }, { "epoch": 4.619451891569854, "grad_norm": 0.21896937489509583, "learning_rate": 3.2565045224621923e-05, "loss": 0.8061, "num_input_tokens_seen": 17882640, "step": 31015 }, { "epoch": 4.620196604110813, "grad_norm": 0.20330147445201874, "learning_rate": 3.25588508109845e-05, "loss": 0.815, "num_input_tokens_seen": 17885744, "step": 31020 }, { "epoch": 4.6209413166517725, "grad_norm": 0.19711072742938995, "learning_rate": 3.2552655886550674e-05, "loss": 0.7781, "num_input_tokens_seen": 17888080, "step": 31025 }, { "epoch": 4.621686029192731, "grad_norm": 0.2767471373081207, "learning_rate": 3.254646045173907e-05, "loss": 0.8092, "num_input_tokens_seen": 17890896, "step": 31030 }, { "epoch": 4.622430741733691, "grad_norm": 0.44264930486679077, "learning_rate": 3.254026450696835e-05, "loss": 0.8237, "num_input_tokens_seen": 17893712, "step": 31035 }, { "epoch": 4.62317545427465, "grad_norm": 0.29835233092308044, "learning_rate": 3.253406805265721e-05, "loss": 0.8204, "num_input_tokens_seen": 17896528, "step": 31040 }, { "epoch": 4.623920166815609, "grad_norm": 0.2817884385585785, "learning_rate": 3.252787108922438e-05, "loss": 0.785, "num_input_tokens_seen": 17899312, "step": 31045 }, { "epoch": 4.624664879356568, "grad_norm": 0.25666919350624084, "learning_rate": 3.252167361708863e-05, "loss": 0.8021, "num_input_tokens_seen": 17902480, "step": 31050 }, { "epoch": 4.625409591897528, "grad_norm": 0.1747918725013733, "learning_rate": 3.251547563666876e-05, "loss": 0.7989, "num_input_tokens_seen": 17905392, "step": 31055 }, { "epoch": 4.6261543044384865, "grad_norm": 0.22034220397472382, "learning_rate": 3.25092771483836e-05, "loss": 0.8136, "num_input_tokens_seen": 17908272, "step": 31060 }, { "epoch": 4.626899016979446, "grad_norm": 0.21972669661045074, "learning_rate": 3.2503078152652024e-05, "loss": 0.7784, "num_input_tokens_seen": 17911152, "step": 31065 }, { "epoch": 4.627643729520405, "grad_norm": 0.22848248481750488, "learning_rate": 3.2496878649892924e-05, "loss": 0.7976, "num_input_tokens_seen": 17913872, "step": 31070 }, { "epoch": 4.628388442061365, "grad_norm": 0.23822322487831116, "learning_rate": 3.2490678640525255e-05, "loss": 0.8222, "num_input_tokens_seen": 17916464, "step": 31075 }, { "epoch": 4.629133154602323, "grad_norm": 0.23476308584213257, "learning_rate": 3.248447812496797e-05, "loss": 0.8646, "num_input_tokens_seen": 17919376, "step": 31080 }, { "epoch": 4.629877867143283, "grad_norm": 0.21480992436408997, "learning_rate": 3.2478277103640086e-05, "loss": 0.8093, "num_input_tokens_seen": 17922128, "step": 31085 }, { "epoch": 4.630622579684242, "grad_norm": 0.16266462206840515, "learning_rate": 3.247207557696064e-05, "loss": 0.8119, "num_input_tokens_seen": 17924880, "step": 31090 }, { "epoch": 4.631367292225201, "grad_norm": 0.21447241306304932, "learning_rate": 3.2465873545348715e-05, "loss": 0.7804, "num_input_tokens_seen": 17928208, "step": 31095 }, { "epoch": 4.63211200476616, "grad_norm": 0.24473689496517181, "learning_rate": 3.2459671009223394e-05, "loss": 0.7927, "num_input_tokens_seen": 17931760, "step": 31100 }, { "epoch": 4.63285671730712, "grad_norm": 0.16858817636966705, "learning_rate": 3.245346796900384e-05, "loss": 0.794, "num_input_tokens_seen": 17934672, "step": 31105 }, { "epoch": 4.6336014298480785, "grad_norm": 0.22673603892326355, "learning_rate": 3.244726442510923e-05, "loss": 0.8202, "num_input_tokens_seen": 17937488, "step": 31110 }, { "epoch": 4.634346142389038, "grad_norm": 0.2648271322250366, "learning_rate": 3.244106037795877e-05, "loss": 0.8468, "num_input_tokens_seen": 17940272, "step": 31115 }, { "epoch": 4.635090854929997, "grad_norm": 0.22617577016353607, "learning_rate": 3.243485582797169e-05, "loss": 0.8047, "num_input_tokens_seen": 17943312, "step": 31120 }, { "epoch": 4.635835567470957, "grad_norm": 0.23104895651340485, "learning_rate": 3.242865077556729e-05, "loss": 0.7976, "num_input_tokens_seen": 17946448, "step": 31125 }, { "epoch": 4.636580280011915, "grad_norm": 0.2498922049999237, "learning_rate": 3.2422445221164876e-05, "loss": 0.8133, "num_input_tokens_seen": 17949328, "step": 31130 }, { "epoch": 4.637324992552875, "grad_norm": 0.2175111621618271, "learning_rate": 3.241623916518378e-05, "loss": 0.811, "num_input_tokens_seen": 17952112, "step": 31135 }, { "epoch": 4.638069705093834, "grad_norm": 0.31320685148239136, "learning_rate": 3.2410032608043405e-05, "loss": 0.8122, "num_input_tokens_seen": 17954608, "step": 31140 }, { "epoch": 4.6388144176347925, "grad_norm": 0.2314988225698471, "learning_rate": 3.2403825550163144e-05, "loss": 0.8096, "num_input_tokens_seen": 17957680, "step": 31145 }, { "epoch": 4.639559130175752, "grad_norm": 0.24279163777828217, "learning_rate": 3.239761799196246e-05, "loss": 0.7964, "num_input_tokens_seen": 17960560, "step": 31150 }, { "epoch": 4.640303842716712, "grad_norm": 0.2154039740562439, "learning_rate": 3.2391409933860825e-05, "loss": 0.8292, "num_input_tokens_seen": 17963408, "step": 31155 }, { "epoch": 4.641048555257671, "grad_norm": 0.2330215722322464, "learning_rate": 3.238520137627777e-05, "loss": 0.8063, "num_input_tokens_seen": 17966160, "step": 31160 }, { "epoch": 4.641793267798629, "grad_norm": 0.27665814757347107, "learning_rate": 3.237899231963282e-05, "loss": 0.8051, "num_input_tokens_seen": 17969296, "step": 31165 }, { "epoch": 4.642537980339589, "grad_norm": 0.24326671659946442, "learning_rate": 3.237278276434557e-05, "loss": 0.8156, "num_input_tokens_seen": 17972272, "step": 31170 }, { "epoch": 4.643282692880548, "grad_norm": 0.20876544713974, "learning_rate": 3.236657271083564e-05, "loss": 0.7908, "num_input_tokens_seen": 17975024, "step": 31175 }, { "epoch": 4.644027405421507, "grad_norm": 0.3041395843029022, "learning_rate": 3.236036215952267e-05, "loss": 0.8082, "num_input_tokens_seen": 17978064, "step": 31180 }, { "epoch": 4.644772117962466, "grad_norm": 0.21997220814228058, "learning_rate": 3.2354151110826355e-05, "loss": 0.807, "num_input_tokens_seen": 17980944, "step": 31185 }, { "epoch": 4.645516830503426, "grad_norm": 0.22181278467178345, "learning_rate": 3.234793956516641e-05, "loss": 0.7953, "num_input_tokens_seen": 17983664, "step": 31190 }, { "epoch": 4.6462615430443845, "grad_norm": 0.2188137024641037, "learning_rate": 3.234172752296259e-05, "loss": 0.793, "num_input_tokens_seen": 17986544, "step": 31195 }, { "epoch": 4.647006255585344, "grad_norm": 0.19276849925518036, "learning_rate": 3.233551498463466e-05, "loss": 0.7979, "num_input_tokens_seen": 17989456, "step": 31200 }, { "epoch": 4.647750968126303, "grad_norm": 0.23253217339515686, "learning_rate": 3.2329301950602456e-05, "loss": 0.806, "num_input_tokens_seen": 17992496, "step": 31205 }, { "epoch": 4.648495680667263, "grad_norm": 0.4238493740558624, "learning_rate": 3.232308842128583e-05, "loss": 0.7892, "num_input_tokens_seen": 17995344, "step": 31210 }, { "epoch": 4.649240393208221, "grad_norm": 0.2769306004047394, "learning_rate": 3.2316874397104656e-05, "loss": 0.7728, "num_input_tokens_seen": 17998256, "step": 31215 }, { "epoch": 4.649985105749181, "grad_norm": 0.22557394206523895, "learning_rate": 3.231065987847885e-05, "loss": 0.8017, "num_input_tokens_seen": 18001360, "step": 31220 }, { "epoch": 4.65072981829014, "grad_norm": 0.2576771080493927, "learning_rate": 3.2304444865828394e-05, "loss": 0.7941, "num_input_tokens_seen": 18004304, "step": 31225 }, { "epoch": 4.651474530831099, "grad_norm": 0.3767901360988617, "learning_rate": 3.229822935957324e-05, "loss": 0.8097, "num_input_tokens_seen": 18007120, "step": 31230 }, { "epoch": 4.652219243372058, "grad_norm": 0.25253674387931824, "learning_rate": 3.2292013360133416e-05, "loss": 0.7782, "num_input_tokens_seen": 18009808, "step": 31235 }, { "epoch": 4.652963955913018, "grad_norm": 0.21331708133220673, "learning_rate": 3.228579686792898e-05, "loss": 0.7936, "num_input_tokens_seen": 18012592, "step": 31240 }, { "epoch": 4.653708668453977, "grad_norm": 0.26110973954200745, "learning_rate": 3.227957988338001e-05, "loss": 0.825, "num_input_tokens_seen": 18015568, "step": 31245 }, { "epoch": 4.654453380994936, "grad_norm": 0.28123366832733154, "learning_rate": 3.227336240690663e-05, "loss": 0.7944, "num_input_tokens_seen": 18018384, "step": 31250 }, { "epoch": 4.655198093535895, "grad_norm": 0.28719615936279297, "learning_rate": 3.226714443892899e-05, "loss": 0.8162, "num_input_tokens_seen": 18021104, "step": 31255 }, { "epoch": 4.655942806076855, "grad_norm": 0.17905978858470917, "learning_rate": 3.226092597986728e-05, "loss": 0.8039, "num_input_tokens_seen": 18023824, "step": 31260 }, { "epoch": 4.656687518617813, "grad_norm": 0.16022989153862, "learning_rate": 3.225470703014171e-05, "loss": 0.7861, "num_input_tokens_seen": 18026704, "step": 31265 }, { "epoch": 4.657432231158773, "grad_norm": 0.2411012202501297, "learning_rate": 3.224848759017253e-05, "loss": 0.8255, "num_input_tokens_seen": 18029520, "step": 31270 }, { "epoch": 4.658176943699732, "grad_norm": 0.18200910091400146, "learning_rate": 3.224226766038004e-05, "loss": 0.8062, "num_input_tokens_seen": 18032176, "step": 31275 }, { "epoch": 4.658921656240691, "grad_norm": 0.2126038521528244, "learning_rate": 3.223604724118453e-05, "loss": 0.8154, "num_input_tokens_seen": 18035248, "step": 31280 }, { "epoch": 4.65966636878165, "grad_norm": 0.19061046838760376, "learning_rate": 3.2229826333006374e-05, "loss": 0.7886, "num_input_tokens_seen": 18037936, "step": 31285 }, { "epoch": 4.66041108132261, "grad_norm": 0.27410489320755005, "learning_rate": 3.222360493626595e-05, "loss": 0.8434, "num_input_tokens_seen": 18040624, "step": 31290 }, { "epoch": 4.661155793863569, "grad_norm": 0.22110240161418915, "learning_rate": 3.2217383051383676e-05, "loss": 0.7895, "num_input_tokens_seen": 18043536, "step": 31295 }, { "epoch": 4.661900506404528, "grad_norm": 0.23136548697948456, "learning_rate": 3.2211160678779994e-05, "loss": 0.8132, "num_input_tokens_seen": 18046480, "step": 31300 }, { "epoch": 4.662645218945487, "grad_norm": 0.2492849975824356, "learning_rate": 3.22049378188754e-05, "loss": 0.7839, "num_input_tokens_seen": 18049328, "step": 31305 }, { "epoch": 4.663389931486446, "grad_norm": 0.2615392804145813, "learning_rate": 3.219871447209039e-05, "loss": 0.8203, "num_input_tokens_seen": 18052368, "step": 31310 }, { "epoch": 4.664134644027405, "grad_norm": 0.18180632591247559, "learning_rate": 3.219249063884553e-05, "loss": 0.7888, "num_input_tokens_seen": 18055280, "step": 31315 }, { "epoch": 4.664879356568365, "grad_norm": 0.203726664185524, "learning_rate": 3.2186266319561395e-05, "loss": 0.8032, "num_input_tokens_seen": 18058416, "step": 31320 }, { "epoch": 4.665624069109324, "grad_norm": 0.22608022391796112, "learning_rate": 3.21800415146586e-05, "loss": 0.8255, "num_input_tokens_seen": 18061008, "step": 31325 }, { "epoch": 4.666368781650283, "grad_norm": 0.18939054012298584, "learning_rate": 3.217381622455778e-05, "loss": 0.7802, "num_input_tokens_seen": 18064080, "step": 31330 }, { "epoch": 4.667113494191242, "grad_norm": 0.25342079997062683, "learning_rate": 3.216759044967965e-05, "loss": 0.8023, "num_input_tokens_seen": 18067120, "step": 31335 }, { "epoch": 4.667858206732202, "grad_norm": 0.2704092264175415, "learning_rate": 3.2161364190444884e-05, "loss": 0.8205, "num_input_tokens_seen": 18069808, "step": 31340 }, { "epoch": 4.668602919273161, "grad_norm": 0.18244525790214539, "learning_rate": 3.2155137447274245e-05, "loss": 0.7909, "num_input_tokens_seen": 18072688, "step": 31345 }, { "epoch": 4.669347631814119, "grad_norm": 0.20523399114608765, "learning_rate": 3.2148910220588495e-05, "loss": 0.8188, "num_input_tokens_seen": 18075440, "step": 31350 }, { "epoch": 4.670092344355079, "grad_norm": 0.33682042360305786, "learning_rate": 3.2142682510808474e-05, "loss": 0.8092, "num_input_tokens_seen": 18078672, "step": 31355 }, { "epoch": 4.670837056896038, "grad_norm": 0.2077380269765854, "learning_rate": 3.213645431835501e-05, "loss": 0.8096, "num_input_tokens_seen": 18081680, "step": 31360 }, { "epoch": 4.671581769436997, "grad_norm": 0.2853768467903137, "learning_rate": 3.213022564364897e-05, "loss": 0.7922, "num_input_tokens_seen": 18084656, "step": 31365 }, { "epoch": 4.672326481977956, "grad_norm": 0.26552826166152954, "learning_rate": 3.212399648711127e-05, "loss": 0.7995, "num_input_tokens_seen": 18087472, "step": 31370 }, { "epoch": 4.673071194518916, "grad_norm": 0.20284190773963928, "learning_rate": 3.2117766849162855e-05, "loss": 0.8378, "num_input_tokens_seen": 18090640, "step": 31375 }, { "epoch": 4.673815907059875, "grad_norm": 0.23899608850479126, "learning_rate": 3.211153673022469e-05, "loss": 0.8017, "num_input_tokens_seen": 18093456, "step": 31380 }, { "epoch": 4.674560619600834, "grad_norm": 0.17026746273040771, "learning_rate": 3.2105306130717786e-05, "loss": 0.7955, "num_input_tokens_seen": 18096304, "step": 31385 }, { "epoch": 4.675305332141793, "grad_norm": 0.270662784576416, "learning_rate": 3.209907505106319e-05, "loss": 0.8356, "num_input_tokens_seen": 18099184, "step": 31390 }, { "epoch": 4.676050044682753, "grad_norm": 0.23956842720508575, "learning_rate": 3.209284349168196e-05, "loss": 0.7959, "num_input_tokens_seen": 18102352, "step": 31395 }, { "epoch": 4.676794757223711, "grad_norm": 0.18854865431785583, "learning_rate": 3.20866114529952e-05, "loss": 0.8021, "num_input_tokens_seen": 18104976, "step": 31400 }, { "epoch": 4.677539469764671, "grad_norm": 0.19478021562099457, "learning_rate": 3.208037893542406e-05, "loss": 0.7998, "num_input_tokens_seen": 18107792, "step": 31405 }, { "epoch": 4.67828418230563, "grad_norm": 0.2344442754983902, "learning_rate": 3.207414593938969e-05, "loss": 0.8185, "num_input_tokens_seen": 18110608, "step": 31410 }, { "epoch": 4.6790288948465895, "grad_norm": 0.1962953507900238, "learning_rate": 3.2067912465313305e-05, "loss": 0.7846, "num_input_tokens_seen": 18113392, "step": 31415 }, { "epoch": 4.679773607387548, "grad_norm": 0.196961909532547, "learning_rate": 3.2061678513616125e-05, "loss": 0.7902, "num_input_tokens_seen": 18116176, "step": 31420 }, { "epoch": 4.680518319928508, "grad_norm": 0.17741431295871735, "learning_rate": 3.205544408471943e-05, "loss": 0.7864, "num_input_tokens_seen": 18118960, "step": 31425 }, { "epoch": 4.681263032469467, "grad_norm": 0.346139132976532, "learning_rate": 3.20492091790445e-05, "loss": 0.8524, "num_input_tokens_seen": 18121616, "step": 31430 }, { "epoch": 4.682007745010426, "grad_norm": 0.28175029158592224, "learning_rate": 3.2042973797012674e-05, "loss": 0.7954, "num_input_tokens_seen": 18124176, "step": 31435 }, { "epoch": 4.682752457551385, "grad_norm": 0.18755105137825012, "learning_rate": 3.203673793904532e-05, "loss": 0.7938, "num_input_tokens_seen": 18126992, "step": 31440 }, { "epoch": 4.683497170092345, "grad_norm": 0.21023361384868622, "learning_rate": 3.2030501605563824e-05, "loss": 0.824, "num_input_tokens_seen": 18130064, "step": 31445 }, { "epoch": 4.684241882633303, "grad_norm": 0.2612200379371643, "learning_rate": 3.202426479698961e-05, "loss": 0.8304, "num_input_tokens_seen": 18133072, "step": 31450 }, { "epoch": 4.684986595174263, "grad_norm": 0.17852593958377838, "learning_rate": 3.201802751374415e-05, "loss": 0.8055, "num_input_tokens_seen": 18135856, "step": 31455 }, { "epoch": 4.685731307715222, "grad_norm": 0.2038068026304245, "learning_rate": 3.201178975624891e-05, "loss": 0.7927, "num_input_tokens_seen": 18138768, "step": 31460 }, { "epoch": 4.6864760202561815, "grad_norm": 0.22415609657764435, "learning_rate": 3.200555152492543e-05, "loss": 0.813, "num_input_tokens_seen": 18141744, "step": 31465 }, { "epoch": 4.68722073279714, "grad_norm": 0.24665649235248566, "learning_rate": 3.199931282019527e-05, "loss": 0.7804, "num_input_tokens_seen": 18144720, "step": 31470 }, { "epoch": 4.687965445338099, "grad_norm": 0.1684965342283249, "learning_rate": 3.1993073642479996e-05, "loss": 0.7808, "num_input_tokens_seen": 18147472, "step": 31475 }, { "epoch": 4.688710157879059, "grad_norm": 0.14074531197547913, "learning_rate": 3.1986833992201235e-05, "loss": 0.7863, "num_input_tokens_seen": 18150384, "step": 31480 }, { "epoch": 4.689454870420018, "grad_norm": 0.33337947726249695, "learning_rate": 3.198059386978064e-05, "loss": 0.8045, "num_input_tokens_seen": 18153488, "step": 31485 }, { "epoch": 4.690199582960977, "grad_norm": 0.16076511144638062, "learning_rate": 3.19743532756399e-05, "loss": 0.7936, "num_input_tokens_seen": 18156368, "step": 31490 }, { "epoch": 4.690944295501936, "grad_norm": 0.19701407849788666, "learning_rate": 3.1968112210200715e-05, "loss": 0.7901, "num_input_tokens_seen": 18159344, "step": 31495 }, { "epoch": 4.6916890080428955, "grad_norm": 0.2415996640920639, "learning_rate": 3.1961870673884845e-05, "loss": 0.7925, "num_input_tokens_seen": 18162320, "step": 31500 }, { "epoch": 4.692433720583855, "grad_norm": 0.25962287187576294, "learning_rate": 3.1955628667114055e-05, "loss": 0.8146, "num_input_tokens_seen": 18165136, "step": 31505 }, { "epoch": 4.693178433124814, "grad_norm": 0.17057926952838898, "learning_rate": 3.1949386190310154e-05, "loss": 0.8449, "num_input_tokens_seen": 18168112, "step": 31510 }, { "epoch": 4.693923145665773, "grad_norm": 0.2095024287700653, "learning_rate": 3.1943143243895e-05, "loss": 0.7813, "num_input_tokens_seen": 18171088, "step": 31515 }, { "epoch": 4.694667858206732, "grad_norm": 0.2696448862552643, "learning_rate": 3.193689982829044e-05, "loss": 0.7955, "num_input_tokens_seen": 18174288, "step": 31520 }, { "epoch": 4.695412570747691, "grad_norm": 0.35050061345100403, "learning_rate": 3.1930655943918405e-05, "loss": 0.8196, "num_input_tokens_seen": 18177296, "step": 31525 }, { "epoch": 4.696157283288651, "grad_norm": 0.1779838502407074, "learning_rate": 3.192441159120081e-05, "loss": 0.8047, "num_input_tokens_seen": 18179888, "step": 31530 }, { "epoch": 4.696901995829609, "grad_norm": 0.2141028195619583, "learning_rate": 3.1918166770559644e-05, "loss": 0.798, "num_input_tokens_seen": 18182992, "step": 31535 }, { "epoch": 4.697646708370569, "grad_norm": 0.3121775984764099, "learning_rate": 3.191192148241689e-05, "loss": 0.8215, "num_input_tokens_seen": 18185968, "step": 31540 }, { "epoch": 4.698391420911528, "grad_norm": 0.2208128124475479, "learning_rate": 3.190567572719457e-05, "loss": 0.8028, "num_input_tokens_seen": 18188560, "step": 31545 }, { "epoch": 4.6991361334524875, "grad_norm": 0.25366419553756714, "learning_rate": 3.189942950531478e-05, "loss": 0.8261, "num_input_tokens_seen": 18191696, "step": 31550 }, { "epoch": 4.699880845993446, "grad_norm": 0.27307969331741333, "learning_rate": 3.189318281719959e-05, "loss": 0.7939, "num_input_tokens_seen": 18194480, "step": 31555 }, { "epoch": 4.700625558534406, "grad_norm": 0.2384364753961563, "learning_rate": 3.1886935663271125e-05, "loss": 0.8135, "num_input_tokens_seen": 18197552, "step": 31560 }, { "epoch": 4.701370271075365, "grad_norm": 0.21404947340488434, "learning_rate": 3.188068804395155e-05, "loss": 0.8165, "num_input_tokens_seen": 18200528, "step": 31565 }, { "epoch": 4.702114983616324, "grad_norm": 0.1811143308877945, "learning_rate": 3.1874439959663055e-05, "loss": 0.7934, "num_input_tokens_seen": 18203216, "step": 31570 }, { "epoch": 4.702859696157283, "grad_norm": 0.22337879240512848, "learning_rate": 3.1868191410827855e-05, "loss": 0.7902, "num_input_tokens_seen": 18206192, "step": 31575 }, { "epoch": 4.703604408698243, "grad_norm": 0.1934257596731186, "learning_rate": 3.18619423978682e-05, "loss": 0.8093, "num_input_tokens_seen": 18209040, "step": 31580 }, { "epoch": 4.7043491212392015, "grad_norm": 0.27828940749168396, "learning_rate": 3.185569292120638e-05, "loss": 0.8311, "num_input_tokens_seen": 18211856, "step": 31585 }, { "epoch": 4.705093833780161, "grad_norm": 0.20022615790367126, "learning_rate": 3.1849442981264707e-05, "loss": 0.8149, "num_input_tokens_seen": 18214960, "step": 31590 }, { "epoch": 4.70583854632112, "grad_norm": 0.22112666070461273, "learning_rate": 3.184319257846553e-05, "loss": 0.7942, "num_input_tokens_seen": 18217744, "step": 31595 }, { "epoch": 4.7065832588620795, "grad_norm": 0.25621548295021057, "learning_rate": 3.183694171323121e-05, "loss": 0.7951, "num_input_tokens_seen": 18220464, "step": 31600 }, { "epoch": 4.707327971403038, "grad_norm": 0.27193745970726013, "learning_rate": 3.183069038598417e-05, "loss": 0.8067, "num_input_tokens_seen": 18223408, "step": 31605 }, { "epoch": 4.708072683943998, "grad_norm": 0.1957867592573166, "learning_rate": 3.182443859714685e-05, "loss": 0.826, "num_input_tokens_seen": 18226416, "step": 31610 }, { "epoch": 4.708817396484957, "grad_norm": 0.3186963200569153, "learning_rate": 3.181818634714171e-05, "loss": 0.8002, "num_input_tokens_seen": 18229072, "step": 31615 }, { "epoch": 4.709562109025916, "grad_norm": 0.2333928644657135, "learning_rate": 3.1811933636391266e-05, "loss": 0.8288, "num_input_tokens_seen": 18231824, "step": 31620 }, { "epoch": 4.710306821566875, "grad_norm": 0.18418917059898376, "learning_rate": 3.1805680465318035e-05, "loss": 0.8067, "num_input_tokens_seen": 18234576, "step": 31625 }, { "epoch": 4.711051534107835, "grad_norm": 0.21313261985778809, "learning_rate": 3.179942683434458e-05, "loss": 0.783, "num_input_tokens_seen": 18237360, "step": 31630 }, { "epoch": 4.7117962466487935, "grad_norm": 0.22607630491256714, "learning_rate": 3.1793172743893515e-05, "loss": 0.7863, "num_input_tokens_seen": 18240176, "step": 31635 }, { "epoch": 4.712540959189753, "grad_norm": 0.21750038862228394, "learning_rate": 3.178691819438746e-05, "loss": 0.7918, "num_input_tokens_seen": 18243344, "step": 31640 }, { "epoch": 4.713285671730712, "grad_norm": 0.20812587440013885, "learning_rate": 3.178066318624905e-05, "loss": 0.8084, "num_input_tokens_seen": 18246256, "step": 31645 }, { "epoch": 4.7140303842716715, "grad_norm": 0.2572188079357147, "learning_rate": 3.1774407719901e-05, "loss": 0.804, "num_input_tokens_seen": 18249328, "step": 31650 }, { "epoch": 4.71477509681263, "grad_norm": 0.21466724574565887, "learning_rate": 3.1768151795766025e-05, "loss": 0.8105, "num_input_tokens_seen": 18252304, "step": 31655 }, { "epoch": 4.715519809353589, "grad_norm": 0.23113222420215607, "learning_rate": 3.1761895414266865e-05, "loss": 0.7846, "num_input_tokens_seen": 18255120, "step": 31660 }, { "epoch": 4.716264521894549, "grad_norm": 0.1635008156299591, "learning_rate": 3.1755638575826295e-05, "loss": 0.8227, "num_input_tokens_seen": 18258096, "step": 31665 }, { "epoch": 4.717009234435508, "grad_norm": 0.21702054142951965, "learning_rate": 3.1749381280867146e-05, "loss": 0.785, "num_input_tokens_seen": 18260720, "step": 31670 }, { "epoch": 4.717753946976467, "grad_norm": 0.25785237550735474, "learning_rate": 3.174312352981225e-05, "loss": 0.7868, "num_input_tokens_seen": 18263440, "step": 31675 }, { "epoch": 4.718498659517426, "grad_norm": 0.20455197989940643, "learning_rate": 3.173686532308448e-05, "loss": 0.8137, "num_input_tokens_seen": 18266192, "step": 31680 }, { "epoch": 4.7192433720583855, "grad_norm": 0.1898738145828247, "learning_rate": 3.1730606661106736e-05, "loss": 0.7981, "num_input_tokens_seen": 18269392, "step": 31685 }, { "epoch": 4.719988084599344, "grad_norm": 0.2863917946815491, "learning_rate": 3.172434754430197e-05, "loss": 0.7814, "num_input_tokens_seen": 18272272, "step": 31690 }, { "epoch": 4.720732797140304, "grad_norm": 0.18553777039051056, "learning_rate": 3.1718087973093135e-05, "loss": 0.8049, "num_input_tokens_seen": 18275408, "step": 31695 }, { "epoch": 4.721477509681263, "grad_norm": 0.26338785886764526, "learning_rate": 3.171182794790322e-05, "loss": 0.8143, "num_input_tokens_seen": 18278448, "step": 31700 }, { "epoch": 4.722222222222222, "grad_norm": 0.18400123715400696, "learning_rate": 3.1705567469155266e-05, "loss": 0.7938, "num_input_tokens_seen": 18281136, "step": 31705 }, { "epoch": 4.722966934763181, "grad_norm": 0.34621235728263855, "learning_rate": 3.169930653727232e-05, "loss": 0.8255, "num_input_tokens_seen": 18284080, "step": 31710 }, { "epoch": 4.723711647304141, "grad_norm": 0.17974695563316345, "learning_rate": 3.169304515267748e-05, "loss": 0.7979, "num_input_tokens_seen": 18286992, "step": 31715 }, { "epoch": 4.7244563598450995, "grad_norm": 0.2150563895702362, "learning_rate": 3.168678331579387e-05, "loss": 0.7825, "num_input_tokens_seen": 18289936, "step": 31720 }, { "epoch": 4.725201072386059, "grad_norm": 0.2786967158317566, "learning_rate": 3.168052102704461e-05, "loss": 0.833, "num_input_tokens_seen": 18292752, "step": 31725 }, { "epoch": 4.725945784927018, "grad_norm": 0.2709062695503235, "learning_rate": 3.1674258286852906e-05, "loss": 0.8092, "num_input_tokens_seen": 18295728, "step": 31730 }, { "epoch": 4.7266904974679775, "grad_norm": 0.2586264908313751, "learning_rate": 3.1667995095641975e-05, "loss": 0.8285, "num_input_tokens_seen": 18298576, "step": 31735 }, { "epoch": 4.727435210008936, "grad_norm": 0.2077135592699051, "learning_rate": 3.1661731453835036e-05, "loss": 0.8265, "num_input_tokens_seen": 18301360, "step": 31740 }, { "epoch": 4.728179922549896, "grad_norm": 0.2702418565750122, "learning_rate": 3.165546736185537e-05, "loss": 0.8135, "num_input_tokens_seen": 18304880, "step": 31745 }, { "epoch": 4.728924635090855, "grad_norm": 0.1708785742521286, "learning_rate": 3.1649202820126275e-05, "loss": 0.7842, "num_input_tokens_seen": 18307664, "step": 31750 }, { "epoch": 4.729669347631814, "grad_norm": 0.1813923567533493, "learning_rate": 3.16429378290711e-05, "loss": 0.8167, "num_input_tokens_seen": 18310384, "step": 31755 }, { "epoch": 4.730414060172773, "grad_norm": 0.15511834621429443, "learning_rate": 3.1636672389113185e-05, "loss": 0.8039, "num_input_tokens_seen": 18313168, "step": 31760 }, { "epoch": 4.731158772713733, "grad_norm": 0.3564102351665497, "learning_rate": 3.163040650067593e-05, "loss": 0.8121, "num_input_tokens_seen": 18316560, "step": 31765 }, { "epoch": 4.7319034852546915, "grad_norm": 0.1477031707763672, "learning_rate": 3.162414016418277e-05, "loss": 0.8176, "num_input_tokens_seen": 18319280, "step": 31770 }, { "epoch": 4.732648197795651, "grad_norm": 0.2288050800561905, "learning_rate": 3.161787338005715e-05, "loss": 0.7776, "num_input_tokens_seen": 18322064, "step": 31775 }, { "epoch": 4.73339291033661, "grad_norm": 0.15642182528972626, "learning_rate": 3.161160614872254e-05, "loss": 0.8053, "num_input_tokens_seen": 18324912, "step": 31780 }, { "epoch": 4.73413762287757, "grad_norm": 0.27548471093177795, "learning_rate": 3.160533847060248e-05, "loss": 0.8007, "num_input_tokens_seen": 18327664, "step": 31785 }, { "epoch": 4.734882335418528, "grad_norm": 0.23449571430683136, "learning_rate": 3.1599070346120497e-05, "loss": 0.7967, "num_input_tokens_seen": 18330736, "step": 31790 }, { "epoch": 4.735627047959488, "grad_norm": 0.18945851922035217, "learning_rate": 3.1592801775700165e-05, "loss": 0.8423, "num_input_tokens_seen": 18333680, "step": 31795 }, { "epoch": 4.736371760500447, "grad_norm": 0.2156895399093628, "learning_rate": 3.1586532759765095e-05, "loss": 0.8002, "num_input_tokens_seen": 18336336, "step": 31800 }, { "epoch": 4.737116473041406, "grad_norm": 0.19300325214862823, "learning_rate": 3.158026329873893e-05, "loss": 0.7815, "num_input_tokens_seen": 18339344, "step": 31805 }, { "epoch": 4.737861185582365, "grad_norm": 0.2170170694589615, "learning_rate": 3.157399339304532e-05, "loss": 0.8056, "num_input_tokens_seen": 18342352, "step": 31810 }, { "epoch": 4.738605898123325, "grad_norm": 0.2620471715927124, "learning_rate": 3.1567723043107955e-05, "loss": 0.7967, "num_input_tokens_seen": 18345040, "step": 31815 }, { "epoch": 4.7393506106642835, "grad_norm": 0.18848922848701477, "learning_rate": 3.156145224935059e-05, "loss": 0.7788, "num_input_tokens_seen": 18348176, "step": 31820 }, { "epoch": 4.740095323205242, "grad_norm": 0.30845826864242554, "learning_rate": 3.1555181012196936e-05, "loss": 0.8258, "num_input_tokens_seen": 18350896, "step": 31825 }, { "epoch": 4.740840035746202, "grad_norm": 0.23164784908294678, "learning_rate": 3.154890933207081e-05, "loss": 0.7799, "num_input_tokens_seen": 18353680, "step": 31830 }, { "epoch": 4.741584748287162, "grad_norm": 0.23176252841949463, "learning_rate": 3.154263720939602e-05, "loss": 0.7803, "num_input_tokens_seen": 18356912, "step": 31835 }, { "epoch": 4.74232946082812, "grad_norm": 0.2812439203262329, "learning_rate": 3.15363646445964e-05, "loss": 0.8059, "num_input_tokens_seen": 18359856, "step": 31840 }, { "epoch": 4.743074173369079, "grad_norm": 0.20648764073848724, "learning_rate": 3.153009163809584e-05, "loss": 0.7887, "num_input_tokens_seen": 18362416, "step": 31845 }, { "epoch": 4.743818885910039, "grad_norm": 0.1924186497926712, "learning_rate": 3.1523818190318234e-05, "loss": 0.7729, "num_input_tokens_seen": 18365360, "step": 31850 }, { "epoch": 4.744563598450998, "grad_norm": 0.287628710269928, "learning_rate": 3.151754430168752e-05, "loss": 0.8004, "num_input_tokens_seen": 18367952, "step": 31855 }, { "epoch": 4.745308310991957, "grad_norm": 0.24208234250545502, "learning_rate": 3.151126997262766e-05, "loss": 0.7763, "num_input_tokens_seen": 18370736, "step": 31860 }, { "epoch": 4.746053023532916, "grad_norm": 0.22670528292655945, "learning_rate": 3.150499520356264e-05, "loss": 0.8014, "num_input_tokens_seen": 18373712, "step": 31865 }, { "epoch": 4.746797736073876, "grad_norm": 0.24864284694194794, "learning_rate": 3.1498719994916507e-05, "loss": 0.7733, "num_input_tokens_seen": 18376816, "step": 31870 }, { "epoch": 4.747542448614834, "grad_norm": 0.24185916781425476, "learning_rate": 3.149244434711328e-05, "loss": 0.8238, "num_input_tokens_seen": 18379536, "step": 31875 }, { "epoch": 4.748287161155794, "grad_norm": 0.19942836463451385, "learning_rate": 3.148616826057708e-05, "loss": 0.8049, "num_input_tokens_seen": 18382512, "step": 31880 }, { "epoch": 4.749031873696753, "grad_norm": 0.16146470606327057, "learning_rate": 3.147989173573199e-05, "loss": 0.8053, "num_input_tokens_seen": 18385232, "step": 31885 }, { "epoch": 4.749776586237712, "grad_norm": 0.24039407074451447, "learning_rate": 3.147361477300216e-05, "loss": 0.8376, "num_input_tokens_seen": 18388272, "step": 31890 }, { "epoch": 4.750521298778671, "grad_norm": 0.200409397482872, "learning_rate": 3.1467337372811764e-05, "loss": 0.8213, "num_input_tokens_seen": 18391440, "step": 31895 }, { "epoch": 4.751266011319631, "grad_norm": 0.20557056367397308, "learning_rate": 3.1461059535585e-05, "loss": 0.7747, "num_input_tokens_seen": 18394544, "step": 31900 }, { "epoch": 4.7520107238605895, "grad_norm": 0.2521958649158478, "learning_rate": 3.1454781261746114e-05, "loss": 0.8217, "num_input_tokens_seen": 18397840, "step": 31905 }, { "epoch": 4.752755436401549, "grad_norm": 0.26598143577575684, "learning_rate": 3.1448502551719336e-05, "loss": 0.7901, "num_input_tokens_seen": 18400496, "step": 31910 }, { "epoch": 4.753500148942508, "grad_norm": 0.20710451900959015, "learning_rate": 3.1442223405928985e-05, "loss": 0.7905, "num_input_tokens_seen": 18403408, "step": 31915 }, { "epoch": 4.754244861483468, "grad_norm": 0.22513136267662048, "learning_rate": 3.1435943824799375e-05, "loss": 0.8344, "num_input_tokens_seen": 18406384, "step": 31920 }, { "epoch": 4.754989574024426, "grad_norm": 0.2393045276403427, "learning_rate": 3.142966380875483e-05, "loss": 0.797, "num_input_tokens_seen": 18409232, "step": 31925 }, { "epoch": 4.755734286565386, "grad_norm": 0.16901569068431854, "learning_rate": 3.1423383358219756e-05, "loss": 0.8178, "num_input_tokens_seen": 18412144, "step": 31930 }, { "epoch": 4.756478999106345, "grad_norm": 0.28130653500556946, "learning_rate": 3.1417102473618554e-05, "loss": 0.8152, "num_input_tokens_seen": 18414896, "step": 31935 }, { "epoch": 4.757223711647304, "grad_norm": 0.19007468223571777, "learning_rate": 3.141082115537565e-05, "loss": 0.8095, "num_input_tokens_seen": 18417840, "step": 31940 }, { "epoch": 4.757968424188263, "grad_norm": 0.22726379334926605, "learning_rate": 3.1404539403915515e-05, "loss": 0.7931, "num_input_tokens_seen": 18420688, "step": 31945 }, { "epoch": 4.758713136729223, "grad_norm": 0.2531690299510956, "learning_rate": 3.139825721966265e-05, "loss": 0.8012, "num_input_tokens_seen": 18423312, "step": 31950 }, { "epoch": 4.759457849270182, "grad_norm": 0.1926729530096054, "learning_rate": 3.139197460304157e-05, "loss": 0.8026, "num_input_tokens_seen": 18425968, "step": 31955 }, { "epoch": 4.760202561811141, "grad_norm": 0.16298453509807587, "learning_rate": 3.138569155447685e-05, "loss": 0.773, "num_input_tokens_seen": 18428656, "step": 31960 }, { "epoch": 4.7609472743521, "grad_norm": 0.4728688895702362, "learning_rate": 3.137940807439304e-05, "loss": 0.7932, "num_input_tokens_seen": 18432816, "step": 31965 }, { "epoch": 4.76169198689306, "grad_norm": 0.20676597952842712, "learning_rate": 3.137312416321478e-05, "loss": 0.8141, "num_input_tokens_seen": 18435600, "step": 31970 }, { "epoch": 4.762436699434018, "grad_norm": 0.21034987270832062, "learning_rate": 3.1366839821366696e-05, "loss": 0.8095, "num_input_tokens_seen": 18438352, "step": 31975 }, { "epoch": 4.763181411974978, "grad_norm": 0.19743910431861877, "learning_rate": 3.136055504927347e-05, "loss": 0.8105, "num_input_tokens_seen": 18441200, "step": 31980 }, { "epoch": 4.763926124515937, "grad_norm": 0.20296046137809753, "learning_rate": 3.135426984735978e-05, "loss": 0.8031, "num_input_tokens_seen": 18444144, "step": 31985 }, { "epoch": 4.764670837056896, "grad_norm": 0.24843378365039825, "learning_rate": 3.134798421605037e-05, "loss": 0.7643, "num_input_tokens_seen": 18446928, "step": 31990 }, { "epoch": 4.765415549597855, "grad_norm": 0.2906402349472046, "learning_rate": 3.134169815577e-05, "loss": 0.8326, "num_input_tokens_seen": 18449872, "step": 31995 }, { "epoch": 4.766160262138815, "grad_norm": 0.1789439469575882, "learning_rate": 3.133541166694345e-05, "loss": 0.8106, "num_input_tokens_seen": 18453008, "step": 32000 }, { "epoch": 4.766904974679774, "grad_norm": 0.2817268371582031, "learning_rate": 3.132912474999555e-05, "loss": 0.8098, "num_input_tokens_seen": 18456368, "step": 32005 }, { "epoch": 4.767649687220732, "grad_norm": 0.21880356967449188, "learning_rate": 3.132283740535111e-05, "loss": 0.8026, "num_input_tokens_seen": 18459152, "step": 32010 }, { "epoch": 4.768394399761692, "grad_norm": 0.29259634017944336, "learning_rate": 3.131654963343504e-05, "loss": 0.7826, "num_input_tokens_seen": 18461840, "step": 32015 }, { "epoch": 4.769139112302652, "grad_norm": 0.28135165572166443, "learning_rate": 3.1310261434672234e-05, "loss": 0.832, "num_input_tokens_seen": 18464976, "step": 32020 }, { "epoch": 4.76988382484361, "grad_norm": 0.226827934384346, "learning_rate": 3.13039728094876e-05, "loss": 0.7902, "num_input_tokens_seen": 18467856, "step": 32025 }, { "epoch": 4.770628537384569, "grad_norm": 0.2270393669605255, "learning_rate": 3.129768375830612e-05, "loss": 0.8005, "num_input_tokens_seen": 18471248, "step": 32030 }, { "epoch": 4.771373249925529, "grad_norm": 0.17176583409309387, "learning_rate": 3.1291394281552776e-05, "loss": 0.767, "num_input_tokens_seen": 18473776, "step": 32035 }, { "epoch": 4.772117962466488, "grad_norm": 0.32750585675239563, "learning_rate": 3.128510437965259e-05, "loss": 0.8146, "num_input_tokens_seen": 18476720, "step": 32040 }, { "epoch": 4.772862675007447, "grad_norm": 0.20777267217636108, "learning_rate": 3.127881405303059e-05, "loss": 0.7837, "num_input_tokens_seen": 18479408, "step": 32045 }, { "epoch": 4.773607387548406, "grad_norm": 0.1739254742860794, "learning_rate": 3.127252330211187e-05, "loss": 0.8009, "num_input_tokens_seen": 18482192, "step": 32050 }, { "epoch": 4.774352100089366, "grad_norm": 0.2880675494670868, "learning_rate": 3.126623212732153e-05, "loss": 0.8136, "num_input_tokens_seen": 18485360, "step": 32055 }, { "epoch": 4.775096812630324, "grad_norm": 0.24096986651420593, "learning_rate": 3.12599405290847e-05, "loss": 0.8211, "num_input_tokens_seen": 18488336, "step": 32060 }, { "epoch": 4.775841525171284, "grad_norm": 0.23777616024017334, "learning_rate": 3.125364850782654e-05, "loss": 0.8246, "num_input_tokens_seen": 18491184, "step": 32065 }, { "epoch": 4.776586237712243, "grad_norm": 0.3289906978607178, "learning_rate": 3.124735606397224e-05, "loss": 0.8176, "num_input_tokens_seen": 18493808, "step": 32070 }, { "epoch": 4.777330950253202, "grad_norm": 0.22081276774406433, "learning_rate": 3.124106319794701e-05, "loss": 0.8335, "num_input_tokens_seen": 18496816, "step": 32075 }, { "epoch": 4.778075662794161, "grad_norm": 0.22837170958518982, "learning_rate": 3.123476991017611e-05, "loss": 0.7969, "num_input_tokens_seen": 18499472, "step": 32080 }, { "epoch": 4.778820375335121, "grad_norm": 0.23138673603534698, "learning_rate": 3.122847620108481e-05, "loss": 0.7953, "num_input_tokens_seen": 18502224, "step": 32085 }, { "epoch": 4.77956508787608, "grad_norm": 0.20681218802928925, "learning_rate": 3.122218207109841e-05, "loss": 0.8106, "num_input_tokens_seen": 18505104, "step": 32090 }, { "epoch": 4.780309800417039, "grad_norm": 0.32620009779930115, "learning_rate": 3.1215887520642237e-05, "loss": 0.7658, "num_input_tokens_seen": 18508080, "step": 32095 }, { "epoch": 4.781054512957998, "grad_norm": 0.17234130203723907, "learning_rate": 3.120959255014166e-05, "loss": 0.7991, "num_input_tokens_seen": 18510768, "step": 32100 }, { "epoch": 4.781799225498958, "grad_norm": 0.25690028071403503, "learning_rate": 3.120329716002208e-05, "loss": 0.8422, "num_input_tokens_seen": 18513840, "step": 32105 }, { "epoch": 4.782543938039916, "grad_norm": 0.19910921156406403, "learning_rate": 3.119700135070888e-05, "loss": 0.817, "num_input_tokens_seen": 18516624, "step": 32110 }, { "epoch": 4.783288650580876, "grad_norm": 0.19828952848911285, "learning_rate": 3.119070512262753e-05, "loss": 0.8238, "num_input_tokens_seen": 18519568, "step": 32115 }, { "epoch": 4.784033363121835, "grad_norm": 0.20020845532417297, "learning_rate": 3.1184408476203496e-05, "loss": 0.7978, "num_input_tokens_seen": 18522288, "step": 32120 }, { "epoch": 4.7847780756627944, "grad_norm": 0.19834758341312408, "learning_rate": 3.1178111411862285e-05, "loss": 0.8002, "num_input_tokens_seen": 18525136, "step": 32125 }, { "epoch": 4.785522788203753, "grad_norm": 0.23808124661445618, "learning_rate": 3.117181393002942e-05, "loss": 0.8038, "num_input_tokens_seen": 18528368, "step": 32130 }, { "epoch": 4.786267500744713, "grad_norm": 0.3074517548084259, "learning_rate": 3.116551603113046e-05, "loss": 0.772, "num_input_tokens_seen": 18531600, "step": 32135 }, { "epoch": 4.787012213285672, "grad_norm": 0.17392128705978394, "learning_rate": 3.1159217715591e-05, "loss": 0.8182, "num_input_tokens_seen": 18534576, "step": 32140 }, { "epoch": 4.787756925826631, "grad_norm": 0.2555955946445465, "learning_rate": 3.115291898383664e-05, "loss": 0.7916, "num_input_tokens_seen": 18537552, "step": 32145 }, { "epoch": 4.78850163836759, "grad_norm": 0.2799191474914551, "learning_rate": 3.114661983629304e-05, "loss": 0.822, "num_input_tokens_seen": 18540560, "step": 32150 }, { "epoch": 4.78924635090855, "grad_norm": 0.17735618352890015, "learning_rate": 3.114032027338585e-05, "loss": 0.793, "num_input_tokens_seen": 18543408, "step": 32155 }, { "epoch": 4.789991063449508, "grad_norm": 0.2276650071144104, "learning_rate": 3.113402029554079e-05, "loss": 0.825, "num_input_tokens_seen": 18546224, "step": 32160 }, { "epoch": 4.790735775990468, "grad_norm": 0.19769884645938873, "learning_rate": 3.112771990318358e-05, "loss": 0.7871, "num_input_tokens_seen": 18548976, "step": 32165 }, { "epoch": 4.791480488531427, "grad_norm": 0.273311972618103, "learning_rate": 3.112141909673997e-05, "loss": 0.7961, "num_input_tokens_seen": 18551952, "step": 32170 }, { "epoch": 4.792225201072386, "grad_norm": 0.21754010021686554, "learning_rate": 3.1115117876635735e-05, "loss": 0.7874, "num_input_tokens_seen": 18555024, "step": 32175 }, { "epoch": 4.792969913613345, "grad_norm": 0.2373591959476471, "learning_rate": 3.1108816243296716e-05, "loss": 0.8129, "num_input_tokens_seen": 18558096, "step": 32180 }, { "epoch": 4.793714626154305, "grad_norm": 0.24588312208652496, "learning_rate": 3.110251419714872e-05, "loss": 0.8033, "num_input_tokens_seen": 18561008, "step": 32185 }, { "epoch": 4.794459338695264, "grad_norm": 0.15518295764923096, "learning_rate": 3.109621173861762e-05, "loss": 0.7892, "num_input_tokens_seen": 18564688, "step": 32190 }, { "epoch": 4.795204051236222, "grad_norm": 0.23103046417236328, "learning_rate": 3.1089908868129316e-05, "loss": 0.8144, "num_input_tokens_seen": 18567728, "step": 32195 }, { "epoch": 4.795948763777182, "grad_norm": 0.24302169680595398, "learning_rate": 3.108360558610974e-05, "loss": 0.7806, "num_input_tokens_seen": 18570448, "step": 32200 }, { "epoch": 4.796693476318142, "grad_norm": 0.32283326983451843, "learning_rate": 3.1077301892984834e-05, "loss": 0.8197, "num_input_tokens_seen": 18573392, "step": 32205 }, { "epoch": 4.7974381888591004, "grad_norm": 0.18522216379642487, "learning_rate": 3.107099778918057e-05, "loss": 0.7914, "num_input_tokens_seen": 18576240, "step": 32210 }, { "epoch": 4.798182901400059, "grad_norm": 0.30239537358283997, "learning_rate": 3.106469327512296e-05, "loss": 0.8018, "num_input_tokens_seen": 18578864, "step": 32215 }, { "epoch": 4.798927613941019, "grad_norm": 0.26472678780555725, "learning_rate": 3.1058388351238035e-05, "loss": 0.7798, "num_input_tokens_seen": 18581680, "step": 32220 }, { "epoch": 4.799672326481978, "grad_norm": 0.22815975546836853, "learning_rate": 3.105208301795185e-05, "loss": 0.7969, "num_input_tokens_seen": 18584496, "step": 32225 }, { "epoch": 4.800417039022937, "grad_norm": 0.24856601655483246, "learning_rate": 3.1045777275690505e-05, "loss": 0.8009, "num_input_tokens_seen": 18587184, "step": 32230 }, { "epoch": 4.801161751563896, "grad_norm": 0.31732991337776184, "learning_rate": 3.1039471124880114e-05, "loss": 0.793, "num_input_tokens_seen": 18590352, "step": 32235 }, { "epoch": 4.801906464104856, "grad_norm": 0.22669851779937744, "learning_rate": 3.103316456594683e-05, "loss": 0.8061, "num_input_tokens_seen": 18593296, "step": 32240 }, { "epoch": 4.802651176645814, "grad_norm": 0.3812701106071472, "learning_rate": 3.1026857599316795e-05, "loss": 0.8083, "num_input_tokens_seen": 18596112, "step": 32245 }, { "epoch": 4.803395889186774, "grad_norm": 0.21342895925045013, "learning_rate": 3.102055022541623e-05, "loss": 0.7748, "num_input_tokens_seen": 18600240, "step": 32250 }, { "epoch": 4.804140601727733, "grad_norm": 0.18967224657535553, "learning_rate": 3.1014242444671366e-05, "loss": 0.8172, "num_input_tokens_seen": 18603088, "step": 32255 }, { "epoch": 4.8048853142686925, "grad_norm": 0.19776730239391327, "learning_rate": 3.100793425750845e-05, "loss": 0.7883, "num_input_tokens_seen": 18605808, "step": 32260 }, { "epoch": 4.805630026809651, "grad_norm": 0.28963184356689453, "learning_rate": 3.100162566435375e-05, "loss": 0.8012, "num_input_tokens_seen": 18608720, "step": 32265 }, { "epoch": 4.806374739350611, "grad_norm": 0.2501569092273712, "learning_rate": 3.0995316665633606e-05, "loss": 0.8078, "num_input_tokens_seen": 18611568, "step": 32270 }, { "epoch": 4.80711945189157, "grad_norm": 0.1653212308883667, "learning_rate": 3.098900726177432e-05, "loss": 0.8119, "num_input_tokens_seen": 18614416, "step": 32275 }, { "epoch": 4.807864164432529, "grad_norm": 0.23998002707958221, "learning_rate": 3.0982697453202284e-05, "loss": 0.8199, "num_input_tokens_seen": 18617232, "step": 32280 }, { "epoch": 4.808608876973488, "grad_norm": 0.22378148138523102, "learning_rate": 3.0976387240343886e-05, "loss": 0.7821, "num_input_tokens_seen": 18620080, "step": 32285 }, { "epoch": 4.809353589514448, "grad_norm": 0.2514478266239166, "learning_rate": 3.097007662362552e-05, "loss": 0.7977, "num_input_tokens_seen": 18622960, "step": 32290 }, { "epoch": 4.8100983020554064, "grad_norm": 0.3491954803466797, "learning_rate": 3.096376560347365e-05, "loss": 0.8314, "num_input_tokens_seen": 18625936, "step": 32295 }, { "epoch": 4.810843014596366, "grad_norm": 0.21874575316905975, "learning_rate": 3.095745418031476e-05, "loss": 0.7997, "num_input_tokens_seen": 18629072, "step": 32300 }, { "epoch": 4.811587727137325, "grad_norm": 0.16324840486049652, "learning_rate": 3.095114235457533e-05, "loss": 0.8082, "num_input_tokens_seen": 18631952, "step": 32305 }, { "epoch": 4.8123324396782845, "grad_norm": 0.21154461801052094, "learning_rate": 3.094483012668189e-05, "loss": 0.8045, "num_input_tokens_seen": 18634928, "step": 32310 }, { "epoch": 4.813077152219243, "grad_norm": 0.2455022782087326, "learning_rate": 3.093851749706101e-05, "loss": 0.7905, "num_input_tokens_seen": 18637648, "step": 32315 }, { "epoch": 4.813821864760203, "grad_norm": 0.20852041244506836, "learning_rate": 3.093220446613926e-05, "loss": 0.8136, "num_input_tokens_seen": 18640368, "step": 32320 }, { "epoch": 4.814566577301162, "grad_norm": 0.1730424463748932, "learning_rate": 3.092589103434324e-05, "loss": 0.7879, "num_input_tokens_seen": 18643536, "step": 32325 }, { "epoch": 4.815311289842121, "grad_norm": 0.30945315957069397, "learning_rate": 3.0919577202099606e-05, "loss": 0.8132, "num_input_tokens_seen": 18646608, "step": 32330 }, { "epoch": 4.81605600238308, "grad_norm": 0.19697874784469604, "learning_rate": 3.091326296983501e-05, "loss": 0.7909, "num_input_tokens_seen": 18649456, "step": 32335 }, { "epoch": 4.816800714924039, "grad_norm": 0.26874667406082153, "learning_rate": 3.0906948337976146e-05, "loss": 0.7921, "num_input_tokens_seen": 18652368, "step": 32340 }, { "epoch": 4.8175454274649985, "grad_norm": 0.2346685528755188, "learning_rate": 3.090063330694972e-05, "loss": 0.8391, "num_input_tokens_seen": 18655312, "step": 32345 }, { "epoch": 4.818290140005958, "grad_norm": 0.2492653876543045, "learning_rate": 3.08943178771825e-05, "loss": 0.7893, "num_input_tokens_seen": 18658128, "step": 32350 }, { "epoch": 4.819034852546917, "grad_norm": 0.1941772848367691, "learning_rate": 3.088800204910123e-05, "loss": 0.8171, "num_input_tokens_seen": 18661040, "step": 32355 }, { "epoch": 4.819779565087876, "grad_norm": 0.18991893529891968, "learning_rate": 3.088168582313273e-05, "loss": 0.8219, "num_input_tokens_seen": 18663632, "step": 32360 }, { "epoch": 4.820524277628835, "grad_norm": 0.18867480754852295, "learning_rate": 3.087536919970381e-05, "loss": 0.7701, "num_input_tokens_seen": 18666352, "step": 32365 }, { "epoch": 4.821268990169795, "grad_norm": 0.23878097534179688, "learning_rate": 3.0869052179241334e-05, "loss": 0.8355, "num_input_tokens_seen": 18669136, "step": 32370 }, { "epoch": 4.822013702710754, "grad_norm": 0.23696771264076233, "learning_rate": 3.0862734762172164e-05, "loss": 0.7842, "num_input_tokens_seen": 18672016, "step": 32375 }, { "epoch": 4.8227584152517124, "grad_norm": 0.1701732873916626, "learning_rate": 3.085641694892322e-05, "loss": 0.85, "num_input_tokens_seen": 18675088, "step": 32380 }, { "epoch": 4.823503127792672, "grad_norm": 0.27047571539878845, "learning_rate": 3.085009873992143e-05, "loss": 0.7908, "num_input_tokens_seen": 18678000, "step": 32385 }, { "epoch": 4.824247840333631, "grad_norm": 0.19799140095710754, "learning_rate": 3.084378013559374e-05, "loss": 0.8196, "num_input_tokens_seen": 18680560, "step": 32390 }, { "epoch": 4.8249925528745905, "grad_norm": 0.1638573259115219, "learning_rate": 3.083746113636716e-05, "loss": 0.8069, "num_input_tokens_seen": 18683376, "step": 32395 }, { "epoch": 4.825737265415549, "grad_norm": 0.23722957074642181, "learning_rate": 3.083114174266869e-05, "loss": 0.7906, "num_input_tokens_seen": 18686608, "step": 32400 }, { "epoch": 4.826481977956509, "grad_norm": 0.2578582763671875, "learning_rate": 3.082482195492536e-05, "loss": 0.7864, "num_input_tokens_seen": 18689776, "step": 32405 }, { "epoch": 4.827226690497468, "grad_norm": 0.3674193024635315, "learning_rate": 3.081850177356425e-05, "loss": 0.8129, "num_input_tokens_seen": 18692880, "step": 32410 }, { "epoch": 4.827971403038427, "grad_norm": 0.27215638756752014, "learning_rate": 3.0812181199012455e-05, "loss": 0.7983, "num_input_tokens_seen": 18696048, "step": 32415 }, { "epoch": 4.828716115579386, "grad_norm": 0.19304050505161285, "learning_rate": 3.080586023169707e-05, "loss": 0.8215, "num_input_tokens_seen": 18698992, "step": 32420 }, { "epoch": 4.829460828120346, "grad_norm": 0.22993476688861847, "learning_rate": 3.079953887204527e-05, "loss": 0.828, "num_input_tokens_seen": 18701776, "step": 32425 }, { "epoch": 4.8302055406613045, "grad_norm": 0.188383087515831, "learning_rate": 3.07932171204842e-05, "loss": 0.7638, "num_input_tokens_seen": 18704752, "step": 32430 }, { "epoch": 4.830950253202264, "grad_norm": 0.3015187978744507, "learning_rate": 3.0786894977441074e-05, "loss": 0.8133, "num_input_tokens_seen": 18707664, "step": 32435 }, { "epoch": 4.831694965743223, "grad_norm": 0.2858906388282776, "learning_rate": 3.078057244334311e-05, "loss": 0.8258, "num_input_tokens_seen": 18710320, "step": 32440 }, { "epoch": 4.8324396782841825, "grad_norm": 0.24545294046401978, "learning_rate": 3.077424951861757e-05, "loss": 0.7981, "num_input_tokens_seen": 18713456, "step": 32445 }, { "epoch": 4.833184390825141, "grad_norm": 0.16325655579566956, "learning_rate": 3.0767926203691724e-05, "loss": 0.8447, "num_input_tokens_seen": 18716304, "step": 32450 }, { "epoch": 4.833929103366101, "grad_norm": 0.20974883437156677, "learning_rate": 3.076160249899286e-05, "loss": 0.8213, "num_input_tokens_seen": 18719312, "step": 32455 }, { "epoch": 4.83467381590706, "grad_norm": 0.24434994161128998, "learning_rate": 3.075527840494834e-05, "loss": 0.8005, "num_input_tokens_seen": 18722128, "step": 32460 }, { "epoch": 4.835418528448019, "grad_norm": 0.27171242237091064, "learning_rate": 3.074895392198551e-05, "loss": 0.8066, "num_input_tokens_seen": 18725136, "step": 32465 }, { "epoch": 4.836163240988978, "grad_norm": 0.47539934515953064, "learning_rate": 3.074262905053173e-05, "loss": 0.8449, "num_input_tokens_seen": 18728432, "step": 32470 }, { "epoch": 4.836907953529938, "grad_norm": 0.28407150506973267, "learning_rate": 3.073630379101443e-05, "loss": 0.8205, "num_input_tokens_seen": 18731408, "step": 32475 }, { "epoch": 4.8376526660708965, "grad_norm": 0.25347355008125305, "learning_rate": 3.072997814386106e-05, "loss": 0.7833, "num_input_tokens_seen": 18734320, "step": 32480 }, { "epoch": 4.838397378611856, "grad_norm": 0.2162007838487625, "learning_rate": 3.0723652109499046e-05, "loss": 0.8262, "num_input_tokens_seen": 18737104, "step": 32485 }, { "epoch": 4.839142091152815, "grad_norm": 0.20996399223804474, "learning_rate": 3.0717325688355893e-05, "loss": 0.8009, "num_input_tokens_seen": 18740144, "step": 32490 }, { "epoch": 4.8398868036937746, "grad_norm": 0.17384743690490723, "learning_rate": 3.071099888085911e-05, "loss": 0.7734, "num_input_tokens_seen": 18742928, "step": 32495 }, { "epoch": 4.840631516234733, "grad_norm": 0.21408985555171967, "learning_rate": 3.070467168743626e-05, "loss": 0.7949, "num_input_tokens_seen": 18745680, "step": 32500 }, { "epoch": 4.841376228775693, "grad_norm": 0.14171692728996277, "learning_rate": 3.0698344108514886e-05, "loss": 0.7875, "num_input_tokens_seen": 18748560, "step": 32505 }, { "epoch": 4.842120941316652, "grad_norm": 0.27551451325416565, "learning_rate": 3.069201614452258e-05, "loss": 0.797, "num_input_tokens_seen": 18751440, "step": 32510 }, { "epoch": 4.842865653857611, "grad_norm": 0.2634374499320984, "learning_rate": 3.0685687795886964e-05, "loss": 0.8098, "num_input_tokens_seen": 18754544, "step": 32515 }, { "epoch": 4.84361036639857, "grad_norm": 0.2922663688659668, "learning_rate": 3.067935906303568e-05, "loss": 0.7971, "num_input_tokens_seen": 18757232, "step": 32520 }, { "epoch": 4.844355078939529, "grad_norm": 0.2549721598625183, "learning_rate": 3.0673029946396406e-05, "loss": 0.782, "num_input_tokens_seen": 18759824, "step": 32525 }, { "epoch": 4.8450997914804885, "grad_norm": 0.2802247703075409, "learning_rate": 3.0666700446396835e-05, "loss": 0.8223, "num_input_tokens_seen": 18762864, "step": 32530 }, { "epoch": 4.845844504021448, "grad_norm": 0.21370606124401093, "learning_rate": 3.0660370563464694e-05, "loss": 0.8089, "num_input_tokens_seen": 18765584, "step": 32535 }, { "epoch": 4.846589216562407, "grad_norm": 0.26495829224586487, "learning_rate": 3.065404029802771e-05, "loss": 0.8137, "num_input_tokens_seen": 18768752, "step": 32540 }, { "epoch": 4.847333929103366, "grad_norm": 0.21070140600204468, "learning_rate": 3.064770965051367e-05, "loss": 0.7804, "num_input_tokens_seen": 18771856, "step": 32545 }, { "epoch": 4.848078641644325, "grad_norm": 0.2293006330728531, "learning_rate": 3.0641378621350384e-05, "loss": 0.8199, "num_input_tokens_seen": 18774640, "step": 32550 }, { "epoch": 4.848823354185284, "grad_norm": 0.18575970828533173, "learning_rate": 3.063504721096566e-05, "loss": 0.7933, "num_input_tokens_seen": 18777424, "step": 32555 }, { "epoch": 4.849568066726244, "grad_norm": 0.15667258203029633, "learning_rate": 3.0628715419787355e-05, "loss": 0.7716, "num_input_tokens_seen": 18780208, "step": 32560 }, { "epoch": 4.8503127792672025, "grad_norm": 0.3475860059261322, "learning_rate": 3.062238324824336e-05, "loss": 0.8404, "num_input_tokens_seen": 18783152, "step": 32565 }, { "epoch": 4.851057491808162, "grad_norm": 0.2102217674255371, "learning_rate": 3.061605069676155e-05, "loss": 0.8012, "num_input_tokens_seen": 18786288, "step": 32570 }, { "epoch": 4.851802204349121, "grad_norm": 0.18294523656368256, "learning_rate": 3.0609717765769866e-05, "loss": 0.786, "num_input_tokens_seen": 18789136, "step": 32575 }, { "epoch": 4.8525469168900806, "grad_norm": 0.2256273776292801, "learning_rate": 3.060338445569627e-05, "loss": 0.8107, "num_input_tokens_seen": 18792016, "step": 32580 }, { "epoch": 4.853291629431039, "grad_norm": 0.22116878628730774, "learning_rate": 3.059705076696873e-05, "loss": 0.8276, "num_input_tokens_seen": 18794896, "step": 32585 }, { "epoch": 4.854036341971999, "grad_norm": 0.27911829948425293, "learning_rate": 3.059071670001526e-05, "loss": 0.825, "num_input_tokens_seen": 18797552, "step": 32590 }, { "epoch": 4.854781054512958, "grad_norm": 0.16510525345802307, "learning_rate": 3.058438225526388e-05, "loss": 0.7893, "num_input_tokens_seen": 18800080, "step": 32595 }, { "epoch": 4.855525767053917, "grad_norm": 0.1862395703792572, "learning_rate": 3.057804743314266e-05, "loss": 0.8139, "num_input_tokens_seen": 18803056, "step": 32600 }, { "epoch": 4.856270479594876, "grad_norm": 0.19094538688659668, "learning_rate": 3.0571712234079666e-05, "loss": 0.7647, "num_input_tokens_seen": 18806000, "step": 32605 }, { "epoch": 4.857015192135836, "grad_norm": 0.2490980327129364, "learning_rate": 3.0565376658503e-05, "loss": 0.8285, "num_input_tokens_seen": 18808752, "step": 32610 }, { "epoch": 4.8577599046767945, "grad_norm": 0.2309640645980835, "learning_rate": 3.055904070684082e-05, "loss": 0.8353, "num_input_tokens_seen": 18811248, "step": 32615 }, { "epoch": 4.858504617217754, "grad_norm": 0.21787267923355103, "learning_rate": 3.055270437952127e-05, "loss": 0.7767, "num_input_tokens_seen": 18814544, "step": 32620 }, { "epoch": 4.859249329758713, "grad_norm": 0.24631518125534058, "learning_rate": 3.054636767697254e-05, "loss": 0.8168, "num_input_tokens_seen": 18817392, "step": 32625 }, { "epoch": 4.859994042299673, "grad_norm": 0.2165374606847763, "learning_rate": 3.054003059962283e-05, "loss": 0.8163, "num_input_tokens_seen": 18820336, "step": 32630 }, { "epoch": 4.860738754840631, "grad_norm": 0.19801932573318481, "learning_rate": 3.0533693147900365e-05, "loss": 0.803, "num_input_tokens_seen": 18823184, "step": 32635 }, { "epoch": 4.861483467381591, "grad_norm": 0.27032962441444397, "learning_rate": 3.052735532223342e-05, "loss": 0.7979, "num_input_tokens_seen": 18826288, "step": 32640 }, { "epoch": 4.86222817992255, "grad_norm": 0.23311468958854675, "learning_rate": 3.052101712305028e-05, "loss": 0.8255, "num_input_tokens_seen": 18828944, "step": 32645 }, { "epoch": 4.862972892463509, "grad_norm": 0.1922559291124344, "learning_rate": 3.051467855077925e-05, "loss": 0.8164, "num_input_tokens_seen": 18831824, "step": 32650 }, { "epoch": 4.863717605004468, "grad_norm": 0.16322611272335052, "learning_rate": 3.050833960584866e-05, "loss": 0.7784, "num_input_tokens_seen": 18834736, "step": 32655 }, { "epoch": 4.864462317545428, "grad_norm": 0.21889959275722504, "learning_rate": 3.0502000288686877e-05, "loss": 0.8224, "num_input_tokens_seen": 18837648, "step": 32660 }, { "epoch": 4.8652070300863866, "grad_norm": 0.23941107094287872, "learning_rate": 3.0495660599722292e-05, "loss": 0.8032, "num_input_tokens_seen": 18840848, "step": 32665 }, { "epoch": 4.865951742627346, "grad_norm": 0.266421914100647, "learning_rate": 3.0489320539383294e-05, "loss": 0.8212, "num_input_tokens_seen": 18843568, "step": 32670 }, { "epoch": 4.866696455168305, "grad_norm": 0.14818909764289856, "learning_rate": 3.0482980108098336e-05, "loss": 0.7918, "num_input_tokens_seen": 18846256, "step": 32675 }, { "epoch": 4.867441167709265, "grad_norm": 0.26658543944358826, "learning_rate": 3.0476639306295874e-05, "loss": 0.7876, "num_input_tokens_seen": 18848976, "step": 32680 }, { "epoch": 4.868185880250223, "grad_norm": 0.2186431586742401, "learning_rate": 3.0470298134404403e-05, "loss": 0.793, "num_input_tokens_seen": 18851600, "step": 32685 }, { "epoch": 4.868930592791182, "grad_norm": 0.2390393614768982, "learning_rate": 3.0463956592852412e-05, "loss": 0.8239, "num_input_tokens_seen": 18854384, "step": 32690 }, { "epoch": 4.869675305332142, "grad_norm": 0.21614669263362885, "learning_rate": 3.0457614682068452e-05, "loss": 0.7849, "num_input_tokens_seen": 18857168, "step": 32695 }, { "epoch": 4.870420017873101, "grad_norm": 0.1511821299791336, "learning_rate": 3.0451272402481086e-05, "loss": 0.8034, "num_input_tokens_seen": 18859792, "step": 32700 }, { "epoch": 4.87116473041406, "grad_norm": 0.21967031061649323, "learning_rate": 3.044492975451889e-05, "loss": 0.7776, "num_input_tokens_seen": 18862704, "step": 32705 }, { "epoch": 4.871909442955019, "grad_norm": 0.20561975240707397, "learning_rate": 3.0438586738610482e-05, "loss": 0.8039, "num_input_tokens_seen": 18865616, "step": 32710 }, { "epoch": 4.872654155495979, "grad_norm": 0.2645452916622162, "learning_rate": 3.0432243355184494e-05, "loss": 0.8247, "num_input_tokens_seen": 18868336, "step": 32715 }, { "epoch": 4.873398868036938, "grad_norm": 0.18849384784698486, "learning_rate": 3.0425899604669577e-05, "loss": 0.806, "num_input_tokens_seen": 18871024, "step": 32720 }, { "epoch": 4.874143580577897, "grad_norm": 0.2647324204444885, "learning_rate": 3.041955548749444e-05, "loss": 0.7707, "num_input_tokens_seen": 18874000, "step": 32725 }, { "epoch": 4.874888293118856, "grad_norm": 0.30076971650123596, "learning_rate": 3.0413211004087773e-05, "loss": 0.8063, "num_input_tokens_seen": 18876848, "step": 32730 }, { "epoch": 4.875633005659815, "grad_norm": 0.2480095773935318, "learning_rate": 3.0406866154878306e-05, "loss": 0.7852, "num_input_tokens_seen": 18879888, "step": 32735 }, { "epoch": 4.876377718200774, "grad_norm": 0.2190619856119156, "learning_rate": 3.0400520940294808e-05, "loss": 0.8024, "num_input_tokens_seen": 18882992, "step": 32740 }, { "epoch": 4.877122430741734, "grad_norm": 0.2566284239292145, "learning_rate": 3.039417536076607e-05, "loss": 0.8131, "num_input_tokens_seen": 18886000, "step": 32745 }, { "epoch": 4.8778671432826926, "grad_norm": 0.20737436413764954, "learning_rate": 3.0387829416720888e-05, "loss": 0.8069, "num_input_tokens_seen": 18888912, "step": 32750 }, { "epoch": 4.878611855823652, "grad_norm": 0.25800931453704834, "learning_rate": 3.0381483108588093e-05, "loss": 0.7805, "num_input_tokens_seen": 18891632, "step": 32755 }, { "epoch": 4.879356568364611, "grad_norm": 0.2049119621515274, "learning_rate": 3.037513643679656e-05, "loss": 0.7958, "num_input_tokens_seen": 18894160, "step": 32760 }, { "epoch": 4.880101280905571, "grad_norm": 0.39527520537376404, "learning_rate": 3.036878940177516e-05, "loss": 0.8136, "num_input_tokens_seen": 18897360, "step": 32765 }, { "epoch": 4.880845993446529, "grad_norm": 0.2533290684223175, "learning_rate": 3.0362442003952795e-05, "loss": 0.7996, "num_input_tokens_seen": 18900176, "step": 32770 }, { "epoch": 4.881590705987489, "grad_norm": 0.20001181960105896, "learning_rate": 3.03560942437584e-05, "loss": 0.7958, "num_input_tokens_seen": 18902768, "step": 32775 }, { "epoch": 4.882335418528448, "grad_norm": 0.2319639027118683, "learning_rate": 3.0349746121620935e-05, "loss": 0.778, "num_input_tokens_seen": 18905424, "step": 32780 }, { "epoch": 4.883080131069407, "grad_norm": 0.2770184874534607, "learning_rate": 3.034339763796938e-05, "loss": 0.8129, "num_input_tokens_seen": 18908144, "step": 32785 }, { "epoch": 4.883824843610366, "grad_norm": 0.23484700918197632, "learning_rate": 3.033704879323273e-05, "loss": 0.8323, "num_input_tokens_seen": 18910864, "step": 32790 }, { "epoch": 4.884569556151326, "grad_norm": 0.3394227623939514, "learning_rate": 3.0330699587840027e-05, "loss": 0.8016, "num_input_tokens_seen": 18913808, "step": 32795 }, { "epoch": 4.885314268692285, "grad_norm": 0.26527073979377747, "learning_rate": 3.0324350022220317e-05, "loss": 0.7911, "num_input_tokens_seen": 18916752, "step": 32800 }, { "epoch": 4.886058981233244, "grad_norm": 0.28213444352149963, "learning_rate": 3.0318000096802686e-05, "loss": 0.8023, "num_input_tokens_seen": 18919568, "step": 32805 }, { "epoch": 4.886803693774203, "grad_norm": 0.2138625830411911, "learning_rate": 3.031164981201622e-05, "loss": 0.807, "num_input_tokens_seen": 18922448, "step": 32810 }, { "epoch": 4.887548406315163, "grad_norm": 0.21346038579940796, "learning_rate": 3.0305299168290064e-05, "loss": 0.8039, "num_input_tokens_seen": 18925360, "step": 32815 }, { "epoch": 4.888293118856121, "grad_norm": 0.24162358045578003, "learning_rate": 3.0298948166053352e-05, "loss": 0.7913, "num_input_tokens_seen": 18928496, "step": 32820 }, { "epoch": 4.889037831397081, "grad_norm": 0.19511327147483826, "learning_rate": 3.0292596805735274e-05, "loss": 0.7851, "num_input_tokens_seen": 18931120, "step": 32825 }, { "epoch": 4.88978254393804, "grad_norm": 0.19957008957862854, "learning_rate": 3.028624508776502e-05, "loss": 0.7921, "num_input_tokens_seen": 18934192, "step": 32830 }, { "epoch": 4.890527256478999, "grad_norm": 0.17432045936584473, "learning_rate": 3.0279893012571807e-05, "loss": 0.8356, "num_input_tokens_seen": 18936880, "step": 32835 }, { "epoch": 4.891271969019958, "grad_norm": 0.23078182339668274, "learning_rate": 3.0273540580584897e-05, "loss": 0.766, "num_input_tokens_seen": 18939856, "step": 32840 }, { "epoch": 4.892016681560918, "grad_norm": 0.21514269709587097, "learning_rate": 3.026718779223356e-05, "loss": 0.8198, "num_input_tokens_seen": 18942544, "step": 32845 }, { "epoch": 4.892761394101877, "grad_norm": 0.2541683614253998, "learning_rate": 3.0260834647947085e-05, "loss": 0.7558, "num_input_tokens_seen": 18945680, "step": 32850 }, { "epoch": 4.893506106642836, "grad_norm": 0.2177531123161316, "learning_rate": 3.0254481148154788e-05, "loss": 0.7861, "num_input_tokens_seen": 18948656, "step": 32855 }, { "epoch": 4.894250819183795, "grad_norm": 0.2365901619195938, "learning_rate": 3.0248127293286022e-05, "loss": 0.8082, "num_input_tokens_seen": 18951312, "step": 32860 }, { "epoch": 4.894995531724755, "grad_norm": 0.2316892147064209, "learning_rate": 3.0241773083770154e-05, "loss": 0.8216, "num_input_tokens_seen": 18954384, "step": 32865 }, { "epoch": 4.895740244265713, "grad_norm": 0.27768269181251526, "learning_rate": 3.0235418520036567e-05, "loss": 0.792, "num_input_tokens_seen": 18957360, "step": 32870 }, { "epoch": 4.896484956806672, "grad_norm": 0.19028101861476898, "learning_rate": 3.0229063602514678e-05, "loss": 0.8139, "num_input_tokens_seen": 18960112, "step": 32875 }, { "epoch": 4.897229669347632, "grad_norm": 0.1863243281841278, "learning_rate": 3.022270833163394e-05, "loss": 0.7944, "num_input_tokens_seen": 18962864, "step": 32880 }, { "epoch": 4.8979743818885915, "grad_norm": 0.15938115119934082, "learning_rate": 3.0216352707823807e-05, "loss": 0.8239, "num_input_tokens_seen": 18965552, "step": 32885 }, { "epoch": 4.89871909442955, "grad_norm": 0.23764647543430328, "learning_rate": 3.0209996731513757e-05, "loss": 0.7886, "num_input_tokens_seen": 18968592, "step": 32890 }, { "epoch": 4.899463806970509, "grad_norm": 0.2450103610754013, "learning_rate": 3.020364040313332e-05, "loss": 0.7867, "num_input_tokens_seen": 18971568, "step": 32895 }, { "epoch": 4.900208519511469, "grad_norm": 0.19717562198638916, "learning_rate": 3.0197283723112013e-05, "loss": 0.8272, "num_input_tokens_seen": 18974288, "step": 32900 }, { "epoch": 4.900953232052427, "grad_norm": 0.21804237365722656, "learning_rate": 3.0190926691879412e-05, "loss": 0.7978, "num_input_tokens_seen": 18977008, "step": 32905 }, { "epoch": 4.901697944593387, "grad_norm": 0.23370443284511566, "learning_rate": 3.018456930986508e-05, "loss": 0.8088, "num_input_tokens_seen": 18979824, "step": 32910 }, { "epoch": 4.902442657134346, "grad_norm": 0.16100992262363434, "learning_rate": 3.017821157749864e-05, "loss": 0.8317, "num_input_tokens_seen": 18982544, "step": 32915 }, { "epoch": 4.903187369675305, "grad_norm": 0.27687862515449524, "learning_rate": 3.0171853495209708e-05, "loss": 0.8113, "num_input_tokens_seen": 18985072, "step": 32920 }, { "epoch": 4.903932082216264, "grad_norm": 0.20110590755939484, "learning_rate": 3.0165495063427952e-05, "loss": 0.7842, "num_input_tokens_seen": 18987984, "step": 32925 }, { "epoch": 4.904676794757224, "grad_norm": 0.22947917878627777, "learning_rate": 3.0159136282583038e-05, "loss": 0.8074, "num_input_tokens_seen": 18990800, "step": 32930 }, { "epoch": 4.905421507298183, "grad_norm": 0.25507593154907227, "learning_rate": 3.0152777153104665e-05, "loss": 0.7956, "num_input_tokens_seen": 18993808, "step": 32935 }, { "epoch": 4.906166219839142, "grad_norm": 0.19716346263885498, "learning_rate": 3.014641767542256e-05, "loss": 0.8072, "num_input_tokens_seen": 18996784, "step": 32940 }, { "epoch": 4.906910932380101, "grad_norm": 0.17838697135448456, "learning_rate": 3.014005784996648e-05, "loss": 0.8117, "num_input_tokens_seen": 18999888, "step": 32945 }, { "epoch": 4.907655644921061, "grad_norm": 0.19862781465053558, "learning_rate": 3.013369767716619e-05, "loss": 0.8124, "num_input_tokens_seen": 19002704, "step": 32950 }, { "epoch": 4.908400357462019, "grad_norm": 0.22142624855041504, "learning_rate": 3.0127337157451475e-05, "loss": 0.8048, "num_input_tokens_seen": 19005520, "step": 32955 }, { "epoch": 4.909145070002979, "grad_norm": 0.23046323657035828, "learning_rate": 3.0120976291252167e-05, "loss": 0.8066, "num_input_tokens_seen": 19008528, "step": 32960 }, { "epoch": 4.909889782543938, "grad_norm": 0.2817821502685547, "learning_rate": 3.0114615078998103e-05, "loss": 0.8016, "num_input_tokens_seen": 19011568, "step": 32965 }, { "epoch": 4.9106344950848975, "grad_norm": 0.16919079422950745, "learning_rate": 3.010825352111914e-05, "loss": 0.8176, "num_input_tokens_seen": 19014448, "step": 32970 }, { "epoch": 4.911379207625856, "grad_norm": 0.2057926505804062, "learning_rate": 3.0101891618045175e-05, "loss": 0.8286, "num_input_tokens_seen": 19017328, "step": 32975 }, { "epoch": 4.912123920166816, "grad_norm": 0.19913239777088165, "learning_rate": 3.009552937020612e-05, "loss": 0.808, "num_input_tokens_seen": 19020016, "step": 32980 }, { "epoch": 4.912868632707775, "grad_norm": 0.23224666714668274, "learning_rate": 3.008916677803191e-05, "loss": 0.8, "num_input_tokens_seen": 19023120, "step": 32985 }, { "epoch": 4.913613345248734, "grad_norm": 0.2510702610015869, "learning_rate": 3.008280384195249e-05, "loss": 0.789, "num_input_tokens_seen": 19025936, "step": 32990 }, { "epoch": 4.914358057789693, "grad_norm": 0.2429235577583313, "learning_rate": 3.0076440562397857e-05, "loss": 0.7928, "num_input_tokens_seen": 19029584, "step": 32995 }, { "epoch": 4.915102770330653, "grad_norm": 0.22985313832759857, "learning_rate": 3.007007693979801e-05, "loss": 0.7927, "num_input_tokens_seen": 19032496, "step": 33000 }, { "epoch": 4.915847482871611, "grad_norm": 0.21930328011512756, "learning_rate": 3.006371297458297e-05, "loss": 0.7884, "num_input_tokens_seen": 19035408, "step": 33005 }, { "epoch": 4.916592195412571, "grad_norm": 0.22285230457782745, "learning_rate": 3.0057348667182806e-05, "loss": 0.8107, "num_input_tokens_seen": 19038288, "step": 33010 }, { "epoch": 4.91733690795353, "grad_norm": 0.3499400317668915, "learning_rate": 3.005098401802758e-05, "loss": 0.8127, "num_input_tokens_seen": 19041296, "step": 33015 }, { "epoch": 4.9180816204944895, "grad_norm": 0.2407611459493637, "learning_rate": 3.0044619027547384e-05, "loss": 0.8289, "num_input_tokens_seen": 19044368, "step": 33020 }, { "epoch": 4.918826333035448, "grad_norm": 0.20321759581565857, "learning_rate": 3.0038253696172342e-05, "loss": 0.8023, "num_input_tokens_seen": 19047216, "step": 33025 }, { "epoch": 4.919571045576408, "grad_norm": 0.24161198735237122, "learning_rate": 3.003188802433261e-05, "loss": 0.8349, "num_input_tokens_seen": 19050128, "step": 33030 }, { "epoch": 4.920315758117367, "grad_norm": 0.20326925814151764, "learning_rate": 3.0025522012458336e-05, "loss": 0.8004, "num_input_tokens_seen": 19053264, "step": 33035 }, { "epoch": 4.921060470658325, "grad_norm": 0.18156594038009644, "learning_rate": 3.0019155660979713e-05, "loss": 0.7861, "num_input_tokens_seen": 19056112, "step": 33040 }, { "epoch": 4.921805183199285, "grad_norm": 0.3082994818687439, "learning_rate": 3.0012788970326967e-05, "loss": 0.8207, "num_input_tokens_seen": 19059216, "step": 33045 }, { "epoch": 4.922549895740245, "grad_norm": 0.15672683715820312, "learning_rate": 3.000642194093032e-05, "loss": 0.7932, "num_input_tokens_seen": 19062288, "step": 33050 }, { "epoch": 4.9232946082812035, "grad_norm": 0.1765609234571457, "learning_rate": 3.0000054573220028e-05, "loss": 0.7851, "num_input_tokens_seen": 19065104, "step": 33055 }, { "epoch": 4.924039320822162, "grad_norm": 0.17011477053165436, "learning_rate": 2.999368686762638e-05, "loss": 0.7864, "num_input_tokens_seen": 19068048, "step": 33060 }, { "epoch": 4.924784033363122, "grad_norm": 0.1905553787946701, "learning_rate": 2.998731882457967e-05, "loss": 0.8174, "num_input_tokens_seen": 19070672, "step": 33065 }, { "epoch": 4.9255287459040815, "grad_norm": 0.21879184246063232, "learning_rate": 2.9980950444510236e-05, "loss": 0.7823, "num_input_tokens_seen": 19073808, "step": 33070 }, { "epoch": 4.92627345844504, "grad_norm": 0.26169607043266296, "learning_rate": 2.9974581727848423e-05, "loss": 0.8058, "num_input_tokens_seen": 19076720, "step": 33075 }, { "epoch": 4.927018170985999, "grad_norm": 0.22590263187885284, "learning_rate": 2.9968212675024603e-05, "loss": 0.8079, "num_input_tokens_seen": 19079536, "step": 33080 }, { "epoch": 4.927762883526959, "grad_norm": 0.1927027702331543, "learning_rate": 2.9961843286469164e-05, "loss": 0.7904, "num_input_tokens_seen": 19082416, "step": 33085 }, { "epoch": 4.928507596067917, "grad_norm": 0.15027651190757751, "learning_rate": 2.9955473562612535e-05, "loss": 0.7766, "num_input_tokens_seen": 19085360, "step": 33090 }, { "epoch": 4.929252308608877, "grad_norm": 0.25876864790916443, "learning_rate": 2.994910350388515e-05, "loss": 0.8268, "num_input_tokens_seen": 19088080, "step": 33095 }, { "epoch": 4.929997021149836, "grad_norm": 0.3509000539779663, "learning_rate": 2.994273311071747e-05, "loss": 0.8218, "num_input_tokens_seen": 19090736, "step": 33100 }, { "epoch": 4.9307417336907955, "grad_norm": 0.20697665214538574, "learning_rate": 2.9936362383539974e-05, "loss": 0.7968, "num_input_tokens_seen": 19093616, "step": 33105 }, { "epoch": 4.931486446231754, "grad_norm": 0.2733590304851532, "learning_rate": 2.992999132278319e-05, "loss": 0.8002, "num_input_tokens_seen": 19096368, "step": 33110 }, { "epoch": 4.932231158772714, "grad_norm": 0.19779141247272491, "learning_rate": 2.9923619928877632e-05, "loss": 0.7796, "num_input_tokens_seen": 19099280, "step": 33115 }, { "epoch": 4.932975871313673, "grad_norm": 0.33118870854377747, "learning_rate": 2.9917248202253856e-05, "loss": 0.8194, "num_input_tokens_seen": 19101936, "step": 33120 }, { "epoch": 4.933720583854632, "grad_norm": 0.19271007180213928, "learning_rate": 2.9910876143342443e-05, "loss": 0.7822, "num_input_tokens_seen": 19104592, "step": 33125 }, { "epoch": 4.934465296395591, "grad_norm": 0.2508475184440613, "learning_rate": 2.9904503752573987e-05, "loss": 0.8246, "num_input_tokens_seen": 19107376, "step": 33130 }, { "epoch": 4.935210008936551, "grad_norm": 0.26137059926986694, "learning_rate": 2.98981310303791e-05, "loss": 0.7941, "num_input_tokens_seen": 19110384, "step": 33135 }, { "epoch": 4.9359547214775095, "grad_norm": 0.17493046820163727, "learning_rate": 2.9891757977188433e-05, "loss": 0.7901, "num_input_tokens_seen": 19113168, "step": 33140 }, { "epoch": 4.936699434018469, "grad_norm": 0.25274935364723206, "learning_rate": 2.9885384593432658e-05, "loss": 0.815, "num_input_tokens_seen": 19115888, "step": 33145 }, { "epoch": 4.937444146559428, "grad_norm": 0.2009219527244568, "learning_rate": 2.987901087954245e-05, "loss": 0.8189, "num_input_tokens_seen": 19118800, "step": 33150 }, { "epoch": 4.9381888591003875, "grad_norm": 0.2662760019302368, "learning_rate": 2.987263683594852e-05, "loss": 0.7898, "num_input_tokens_seen": 19121712, "step": 33155 }, { "epoch": 4.938933571641346, "grad_norm": 0.2404397875070572, "learning_rate": 2.986626246308161e-05, "loss": 0.8063, "num_input_tokens_seen": 19124592, "step": 33160 }, { "epoch": 4.939678284182306, "grad_norm": 0.24068504571914673, "learning_rate": 2.9859887761372464e-05, "loss": 0.8264, "num_input_tokens_seen": 19127728, "step": 33165 }, { "epoch": 4.940422996723265, "grad_norm": 0.2578381597995758, "learning_rate": 2.9853512731251866e-05, "loss": 0.8327, "num_input_tokens_seen": 19130640, "step": 33170 }, { "epoch": 4.941167709264224, "grad_norm": 0.30214923620224, "learning_rate": 2.9847137373150602e-05, "loss": 0.7939, "num_input_tokens_seen": 19133744, "step": 33175 }, { "epoch": 4.941912421805183, "grad_norm": 0.2935713231563568, "learning_rate": 2.9840761687499507e-05, "loss": 0.7947, "num_input_tokens_seen": 19136368, "step": 33180 }, { "epoch": 4.942657134346143, "grad_norm": 0.2622408866882324, "learning_rate": 2.9834385674729416e-05, "loss": 0.7897, "num_input_tokens_seen": 19139120, "step": 33185 }, { "epoch": 4.9434018468871015, "grad_norm": 0.20802199840545654, "learning_rate": 2.98280093352712e-05, "loss": 0.7852, "num_input_tokens_seen": 19141936, "step": 33190 }, { "epoch": 4.944146559428061, "grad_norm": 0.2943110167980194, "learning_rate": 2.9821632669555743e-05, "loss": 0.8092, "num_input_tokens_seen": 19145104, "step": 33195 }, { "epoch": 4.94489127196902, "grad_norm": 0.3409102261066437, "learning_rate": 2.981525567801395e-05, "loss": 0.8144, "num_input_tokens_seen": 19148048, "step": 33200 }, { "epoch": 4.945635984509979, "grad_norm": 0.27052485942840576, "learning_rate": 2.9808878361076754e-05, "loss": 0.8328, "num_input_tokens_seen": 19150928, "step": 33205 }, { "epoch": 4.946380697050938, "grad_norm": 0.21451576054096222, "learning_rate": 2.9802500719175107e-05, "loss": 0.8065, "num_input_tokens_seen": 19153520, "step": 33210 }, { "epoch": 4.947125409591898, "grad_norm": 0.19500499963760376, "learning_rate": 2.9796122752739997e-05, "loss": 0.7862, "num_input_tokens_seen": 19156272, "step": 33215 }, { "epoch": 4.947870122132857, "grad_norm": 0.3052375912666321, "learning_rate": 2.9789744462202407e-05, "loss": 0.8006, "num_input_tokens_seen": 19158896, "step": 33220 }, { "epoch": 4.9486148346738155, "grad_norm": 0.19319263100624084, "learning_rate": 2.9783365847993362e-05, "loss": 0.7891, "num_input_tokens_seen": 19161744, "step": 33225 }, { "epoch": 4.949359547214775, "grad_norm": 0.16459517180919647, "learning_rate": 2.9776986910543896e-05, "loss": 0.7815, "num_input_tokens_seen": 19164624, "step": 33230 }, { "epoch": 4.950104259755735, "grad_norm": 0.1811182200908661, "learning_rate": 2.9770607650285074e-05, "loss": 0.8192, "num_input_tokens_seen": 19167792, "step": 33235 }, { "epoch": 4.9508489722966935, "grad_norm": 0.2114885449409485, "learning_rate": 2.9764228067647987e-05, "loss": 0.7986, "num_input_tokens_seen": 19170736, "step": 33240 }, { "epoch": 4.951593684837652, "grad_norm": 0.20619817078113556, "learning_rate": 2.975784816306374e-05, "loss": 0.8146, "num_input_tokens_seen": 19173456, "step": 33245 }, { "epoch": 4.952338397378612, "grad_norm": 0.22193270921707153, "learning_rate": 2.9751467936963456e-05, "loss": 0.7866, "num_input_tokens_seen": 19176368, "step": 33250 }, { "epoch": 4.953083109919571, "grad_norm": 0.22144678235054016, "learning_rate": 2.9745087389778286e-05, "loss": 0.7905, "num_input_tokens_seen": 19179280, "step": 33255 }, { "epoch": 4.95382782246053, "grad_norm": 0.300178587436676, "learning_rate": 2.9738706521939402e-05, "loss": 0.8034, "num_input_tokens_seen": 19182480, "step": 33260 }, { "epoch": 4.954572535001489, "grad_norm": 0.20767724514007568, "learning_rate": 2.9732325333877997e-05, "loss": 0.834, "num_input_tokens_seen": 19185712, "step": 33265 }, { "epoch": 4.955317247542449, "grad_norm": 0.20026126503944397, "learning_rate": 2.9725943826025287e-05, "loss": 0.8093, "num_input_tokens_seen": 19188464, "step": 33270 }, { "epoch": 4.9560619600834075, "grad_norm": 0.1837531179189682, "learning_rate": 2.9719561998812506e-05, "loss": 0.7815, "num_input_tokens_seen": 19191152, "step": 33275 }, { "epoch": 4.956806672624367, "grad_norm": 0.247735857963562, "learning_rate": 2.971317985267092e-05, "loss": 0.8345, "num_input_tokens_seen": 19194480, "step": 33280 }, { "epoch": 4.957551385165326, "grad_norm": 0.23514355719089508, "learning_rate": 2.9706797388031794e-05, "loss": 0.8211, "num_input_tokens_seen": 19197264, "step": 33285 }, { "epoch": 4.9582960977062855, "grad_norm": 0.21693669259548187, "learning_rate": 2.9700414605326444e-05, "loss": 0.8034, "num_input_tokens_seen": 19200144, "step": 33290 }, { "epoch": 4.959040810247244, "grad_norm": 0.3412834405899048, "learning_rate": 2.969403150498618e-05, "loss": 0.8046, "num_input_tokens_seen": 19203376, "step": 33295 }, { "epoch": 4.959785522788204, "grad_norm": 0.19331622123718262, "learning_rate": 2.9687648087442353e-05, "loss": 0.7991, "num_input_tokens_seen": 19206096, "step": 33300 }, { "epoch": 4.960530235329163, "grad_norm": 0.22452585399150848, "learning_rate": 2.968126435312632e-05, "loss": 0.785, "num_input_tokens_seen": 19208784, "step": 33305 }, { "epoch": 4.961274947870122, "grad_norm": 0.20623233914375305, "learning_rate": 2.9674880302469487e-05, "loss": 0.8169, "num_input_tokens_seen": 19211472, "step": 33310 }, { "epoch": 4.962019660411081, "grad_norm": 0.2276846170425415, "learning_rate": 2.9668495935903246e-05, "loss": 0.8005, "num_input_tokens_seen": 19214160, "step": 33315 }, { "epoch": 4.962764372952041, "grad_norm": 0.19992491602897644, "learning_rate": 2.9662111253859025e-05, "loss": 0.8062, "num_input_tokens_seen": 19217008, "step": 33320 }, { "epoch": 4.9635090854929995, "grad_norm": 0.2656519114971161, "learning_rate": 2.9655726256768286e-05, "loss": 0.8077, "num_input_tokens_seen": 19220048, "step": 33325 }, { "epoch": 4.964253798033959, "grad_norm": 0.2238541692495346, "learning_rate": 2.96493409450625e-05, "loss": 0.7947, "num_input_tokens_seen": 19222736, "step": 33330 }, { "epoch": 4.964998510574918, "grad_norm": 0.17216457426548004, "learning_rate": 2.9642955319173142e-05, "loss": 0.8121, "num_input_tokens_seen": 19225264, "step": 33335 }, { "epoch": 4.965743223115878, "grad_norm": 0.17967204749584198, "learning_rate": 2.963656937953175e-05, "loss": 0.8095, "num_input_tokens_seen": 19228112, "step": 33340 }, { "epoch": 4.966487935656836, "grad_norm": 0.21478214859962463, "learning_rate": 2.9630183126569843e-05, "loss": 0.7955, "num_input_tokens_seen": 19231152, "step": 33345 }, { "epoch": 4.967232648197796, "grad_norm": 0.1924809366464615, "learning_rate": 2.9623796560718997e-05, "loss": 0.8105, "num_input_tokens_seen": 19233616, "step": 33350 }, { "epoch": 4.967977360738755, "grad_norm": 0.26717257499694824, "learning_rate": 2.961740968241077e-05, "loss": 0.8369, "num_input_tokens_seen": 19236432, "step": 33355 }, { "epoch": 4.968722073279714, "grad_norm": 0.13974077999591827, "learning_rate": 2.961102249207677e-05, "loss": 0.8061, "num_input_tokens_seen": 19239344, "step": 33360 }, { "epoch": 4.969466785820673, "grad_norm": 0.21874138712882996, "learning_rate": 2.9604634990148617e-05, "loss": 0.8108, "num_input_tokens_seen": 19242448, "step": 33365 }, { "epoch": 4.970211498361633, "grad_norm": 0.22847265005111694, "learning_rate": 2.9598247177057952e-05, "loss": 0.7746, "num_input_tokens_seen": 19245168, "step": 33370 }, { "epoch": 4.9709562109025915, "grad_norm": 0.2442932426929474, "learning_rate": 2.9591859053236436e-05, "loss": 0.8105, "num_input_tokens_seen": 19248208, "step": 33375 }, { "epoch": 4.971700923443551, "grad_norm": 0.18220289051532745, "learning_rate": 2.9585470619115762e-05, "loss": 0.8144, "num_input_tokens_seen": 19251056, "step": 33380 }, { "epoch": 4.97244563598451, "grad_norm": 0.3340872526168823, "learning_rate": 2.9579081875127625e-05, "loss": 0.8046, "num_input_tokens_seen": 19253968, "step": 33385 }, { "epoch": 4.973190348525469, "grad_norm": 0.17463937401771545, "learning_rate": 2.9572692821703745e-05, "loss": 0.8083, "num_input_tokens_seen": 19256720, "step": 33390 }, { "epoch": 4.973935061066428, "grad_norm": 0.27407917380332947, "learning_rate": 2.9566303459275884e-05, "loss": 0.7951, "num_input_tokens_seen": 19259728, "step": 33395 }, { "epoch": 4.974679773607388, "grad_norm": 0.16412198543548584, "learning_rate": 2.9559913788275793e-05, "loss": 0.8124, "num_input_tokens_seen": 19262736, "step": 33400 }, { "epoch": 4.975424486148347, "grad_norm": 0.1833990514278412, "learning_rate": 2.955352380913527e-05, "loss": 0.8174, "num_input_tokens_seen": 19265488, "step": 33405 }, { "epoch": 4.9761691986893055, "grad_norm": 0.22861014306545258, "learning_rate": 2.954713352228613e-05, "loss": 0.7949, "num_input_tokens_seen": 19268080, "step": 33410 }, { "epoch": 4.976913911230265, "grad_norm": 0.24816277623176575, "learning_rate": 2.9540742928160182e-05, "loss": 0.8039, "num_input_tokens_seen": 19271536, "step": 33415 }, { "epoch": 4.977658623771224, "grad_norm": 0.21916112303733826, "learning_rate": 2.953435202718929e-05, "loss": 0.8145, "num_input_tokens_seen": 19274320, "step": 33420 }, { "epoch": 4.978403336312184, "grad_norm": 0.2334594875574112, "learning_rate": 2.9527960819805327e-05, "loss": 0.7823, "num_input_tokens_seen": 19277072, "step": 33425 }, { "epoch": 4.979148048853142, "grad_norm": 0.24889348447322845, "learning_rate": 2.9521569306440183e-05, "loss": 0.8169, "num_input_tokens_seen": 19279984, "step": 33430 }, { "epoch": 4.979892761394102, "grad_norm": 0.23597130179405212, "learning_rate": 2.9515177487525763e-05, "loss": 0.84, "num_input_tokens_seen": 19282736, "step": 33435 }, { "epoch": 4.980637473935061, "grad_norm": 0.25036051869392395, "learning_rate": 2.9508785363494e-05, "loss": 0.8166, "num_input_tokens_seen": 19285552, "step": 33440 }, { "epoch": 4.98138218647602, "grad_norm": 0.31383949518203735, "learning_rate": 2.950239293477687e-05, "loss": 0.7996, "num_input_tokens_seen": 19288912, "step": 33445 }, { "epoch": 4.982126899016979, "grad_norm": 0.21559566259384155, "learning_rate": 2.949600020180632e-05, "loss": 0.8179, "num_input_tokens_seen": 19291632, "step": 33450 }, { "epoch": 4.982871611557939, "grad_norm": 0.25826412439346313, "learning_rate": 2.9489607165014353e-05, "loss": 0.795, "num_input_tokens_seen": 19294576, "step": 33455 }, { "epoch": 4.9836163240988975, "grad_norm": 0.2712852358818054, "learning_rate": 2.9483213824833e-05, "loss": 0.7917, "num_input_tokens_seen": 19297840, "step": 33460 }, { "epoch": 4.984361036639857, "grad_norm": 0.2836904227733612, "learning_rate": 2.9476820181694276e-05, "loss": 0.8258, "num_input_tokens_seen": 19300880, "step": 33465 }, { "epoch": 4.985105749180816, "grad_norm": 0.29484111070632935, "learning_rate": 2.9470426236030247e-05, "loss": 0.8138, "num_input_tokens_seen": 19304176, "step": 33470 }, { "epoch": 4.985850461721776, "grad_norm": 0.2636026442050934, "learning_rate": 2.9464031988272983e-05, "loss": 0.7926, "num_input_tokens_seen": 19307120, "step": 33475 }, { "epoch": 4.986595174262734, "grad_norm": 0.2655150592327118, "learning_rate": 2.9457637438854592e-05, "loss": 0.8026, "num_input_tokens_seen": 19310096, "step": 33480 }, { "epoch": 4.987339886803694, "grad_norm": 0.15953302383422852, "learning_rate": 2.9451242588207185e-05, "loss": 0.8151, "num_input_tokens_seen": 19312752, "step": 33485 }, { "epoch": 4.988084599344653, "grad_norm": 0.19394145905971527, "learning_rate": 2.94448474367629e-05, "loss": 0.8042, "num_input_tokens_seen": 19315312, "step": 33490 }, { "epoch": 4.988829311885612, "grad_norm": 0.1905316859483719, "learning_rate": 2.94384519849539e-05, "loss": 0.7955, "num_input_tokens_seen": 19318032, "step": 33495 }, { "epoch": 4.989574024426571, "grad_norm": 0.19210605323314667, "learning_rate": 2.9432056233212357e-05, "loss": 0.8122, "num_input_tokens_seen": 19320880, "step": 33500 }, { "epoch": 4.990318736967531, "grad_norm": 0.2436237335205078, "learning_rate": 2.9425660181970472e-05, "loss": 0.8322, "num_input_tokens_seen": 19323920, "step": 33505 }, { "epoch": 4.99106344950849, "grad_norm": 0.24777451157569885, "learning_rate": 2.9419263831660475e-05, "loss": 0.7975, "num_input_tokens_seen": 19326864, "step": 33510 }, { "epoch": 4.991808162049449, "grad_norm": 0.24041683971881866, "learning_rate": 2.941286718271459e-05, "loss": 0.8212, "num_input_tokens_seen": 19329872, "step": 33515 }, { "epoch": 4.992552874590408, "grad_norm": 0.21262507140636444, "learning_rate": 2.9406470235565075e-05, "loss": 0.8149, "num_input_tokens_seen": 19332688, "step": 33520 }, { "epoch": 4.993297587131368, "grad_norm": 0.1997487097978592, "learning_rate": 2.940007299064423e-05, "loss": 0.8005, "num_input_tokens_seen": 19335536, "step": 33525 }, { "epoch": 4.994042299672326, "grad_norm": 0.26443037390708923, "learning_rate": 2.9393675448384332e-05, "loss": 0.7973, "num_input_tokens_seen": 19338192, "step": 33530 }, { "epoch": 4.994787012213286, "grad_norm": 0.21424034237861633, "learning_rate": 2.9387277609217713e-05, "loss": 0.8042, "num_input_tokens_seen": 19341008, "step": 33535 }, { "epoch": 4.995531724754245, "grad_norm": 0.19855408370494843, "learning_rate": 2.9380879473576705e-05, "loss": 0.8061, "num_input_tokens_seen": 19343888, "step": 33540 }, { "epoch": 4.996276437295204, "grad_norm": 0.21439291536808014, "learning_rate": 2.9374481041893687e-05, "loss": 0.8165, "num_input_tokens_seen": 19346928, "step": 33545 }, { "epoch": 4.997021149836163, "grad_norm": 0.21262961626052856, "learning_rate": 2.9368082314601018e-05, "loss": 0.8192, "num_input_tokens_seen": 19349808, "step": 33550 }, { "epoch": 4.997765862377122, "grad_norm": 0.23498691618442535, "learning_rate": 2.9361683292131103e-05, "loss": 0.8064, "num_input_tokens_seen": 19352752, "step": 33555 }, { "epoch": 4.998510574918082, "grad_norm": 0.1604195535182953, "learning_rate": 2.935528397491637e-05, "loss": 0.8166, "num_input_tokens_seen": 19355472, "step": 33560 }, { "epoch": 4.999255287459041, "grad_norm": 0.1994994431734085, "learning_rate": 2.9348884363389246e-05, "loss": 0.7965, "num_input_tokens_seen": 19358320, "step": 33565 }, { "epoch": 5.0, "grad_norm": 0.4092390537261963, "learning_rate": 2.9342484457982206e-05, "loss": 0.813, "num_input_tokens_seen": 19360624, "step": 33570 }, { "epoch": 5.0, "eval_loss": 0.8033895492553711, "eval_runtime": 45.3906, "eval_samples_per_second": 65.741, "eval_steps_per_second": 16.435, "num_input_tokens_seen": 19360624, "step": 33570 }, { "epoch": 5.000744712540959, "grad_norm": 0.1773453950881958, "learning_rate": 2.9336084259127716e-05, "loss": 0.7847, "num_input_tokens_seen": 19363472, "step": 33575 }, { "epoch": 5.001489425081918, "grad_norm": 0.23405396938323975, "learning_rate": 2.932968376725828e-05, "loss": 0.7821, "num_input_tokens_seen": 19366288, "step": 33580 }, { "epoch": 5.002234137622877, "grad_norm": 0.2261151522397995, "learning_rate": 2.932328298280642e-05, "loss": 0.8153, "num_input_tokens_seen": 19368944, "step": 33585 }, { "epoch": 5.002978850163837, "grad_norm": 0.19925859570503235, "learning_rate": 2.9316881906204675e-05, "loss": 0.7975, "num_input_tokens_seen": 19371632, "step": 33590 }, { "epoch": 5.003723562704796, "grad_norm": 0.18629378080368042, "learning_rate": 2.9310480537885605e-05, "loss": 0.782, "num_input_tokens_seen": 19374608, "step": 33595 }, { "epoch": 5.004468275245755, "grad_norm": 0.29142919182777405, "learning_rate": 2.9304078878281778e-05, "loss": 0.8011, "num_input_tokens_seen": 19377520, "step": 33600 }, { "epoch": 5.005212987786714, "grad_norm": 0.22357061505317688, "learning_rate": 2.9297676927825803e-05, "loss": 0.7994, "num_input_tokens_seen": 19380464, "step": 33605 }, { "epoch": 5.005957700327674, "grad_norm": 0.20519888401031494, "learning_rate": 2.9291274686950294e-05, "loss": 0.8233, "num_input_tokens_seen": 19383152, "step": 33610 }, { "epoch": 5.006702412868632, "grad_norm": 0.18835300207138062, "learning_rate": 2.9284872156087896e-05, "loss": 0.8195, "num_input_tokens_seen": 19386096, "step": 33615 }, { "epoch": 5.007447125409592, "grad_norm": 0.21553891897201538, "learning_rate": 2.9278469335671245e-05, "loss": 0.7936, "num_input_tokens_seen": 19389072, "step": 33620 }, { "epoch": 5.008191837950551, "grad_norm": 0.2385515421628952, "learning_rate": 2.9272066226133037e-05, "loss": 0.7812, "num_input_tokens_seen": 19391792, "step": 33625 }, { "epoch": 5.00893655049151, "grad_norm": 0.20083168148994446, "learning_rate": 2.9265662827905967e-05, "loss": 0.8052, "num_input_tokens_seen": 19394608, "step": 33630 }, { "epoch": 5.009681263032469, "grad_norm": 0.2151610255241394, "learning_rate": 2.925925914142274e-05, "loss": 0.8232, "num_input_tokens_seen": 19397232, "step": 33635 }, { "epoch": 5.010425975573429, "grad_norm": 0.21434815227985382, "learning_rate": 2.92528551671161e-05, "loss": 0.7934, "num_input_tokens_seen": 19399888, "step": 33640 }, { "epoch": 5.011170688114388, "grad_norm": 0.34347352385520935, "learning_rate": 2.9246450905418798e-05, "loss": 0.832, "num_input_tokens_seen": 19403152, "step": 33645 }, { "epoch": 5.011915400655347, "grad_norm": 0.2461152821779251, "learning_rate": 2.9240046356763607e-05, "loss": 0.7868, "num_input_tokens_seen": 19405744, "step": 33650 }, { "epoch": 5.012660113196306, "grad_norm": 0.2007921189069748, "learning_rate": 2.9233641521583325e-05, "loss": 0.8167, "num_input_tokens_seen": 19408304, "step": 33655 }, { "epoch": 5.013404825737266, "grad_norm": 0.2372947633266449, "learning_rate": 2.9227236400310765e-05, "loss": 0.8185, "num_input_tokens_seen": 19411504, "step": 33660 }, { "epoch": 5.014149538278224, "grad_norm": 0.2657589912414551, "learning_rate": 2.9220830993378745e-05, "loss": 0.7816, "num_input_tokens_seen": 19414448, "step": 33665 }, { "epoch": 5.014894250819184, "grad_norm": 0.2416193038225174, "learning_rate": 2.9214425301220133e-05, "loss": 0.794, "num_input_tokens_seen": 19417360, "step": 33670 }, { "epoch": 5.015638963360143, "grad_norm": 0.19475209712982178, "learning_rate": 2.9208019324267798e-05, "loss": 0.7956, "num_input_tokens_seen": 19420080, "step": 33675 }, { "epoch": 5.0163836759011025, "grad_norm": 0.2511215806007385, "learning_rate": 2.920161306295462e-05, "loss": 0.7944, "num_input_tokens_seen": 19423184, "step": 33680 }, { "epoch": 5.017128388442061, "grad_norm": 0.2131212055683136, "learning_rate": 2.9195206517713515e-05, "loss": 0.7949, "num_input_tokens_seen": 19425776, "step": 33685 }, { "epoch": 5.017873100983021, "grad_norm": 0.1944449245929718, "learning_rate": 2.9188799688977407e-05, "loss": 0.8066, "num_input_tokens_seen": 19428816, "step": 33690 }, { "epoch": 5.01861781352398, "grad_norm": 0.2019759863615036, "learning_rate": 2.9182392577179257e-05, "loss": 0.801, "num_input_tokens_seen": 19431888, "step": 33695 }, { "epoch": 5.019362526064939, "grad_norm": 0.23812314867973328, "learning_rate": 2.917598518275201e-05, "loss": 0.8097, "num_input_tokens_seen": 19434928, "step": 33700 }, { "epoch": 5.020107238605898, "grad_norm": 0.21240538358688354, "learning_rate": 2.9169577506128664e-05, "loss": 0.7968, "num_input_tokens_seen": 19437904, "step": 33705 }, { "epoch": 5.020851951146858, "grad_norm": 0.1832251250743866, "learning_rate": 2.9163169547742225e-05, "loss": 0.7708, "num_input_tokens_seen": 19440976, "step": 33710 }, { "epoch": 5.021596663687816, "grad_norm": 0.2184571772813797, "learning_rate": 2.9156761308025715e-05, "loss": 0.7774, "num_input_tokens_seen": 19443568, "step": 33715 }, { "epoch": 5.022341376228776, "grad_norm": 0.2793559730052948, "learning_rate": 2.915035278741218e-05, "loss": 0.806, "num_input_tokens_seen": 19446256, "step": 33720 }, { "epoch": 5.023086088769735, "grad_norm": 0.29081928730010986, "learning_rate": 2.914394398633467e-05, "loss": 0.8231, "num_input_tokens_seen": 19449040, "step": 33725 }, { "epoch": 5.0238308013106945, "grad_norm": 0.23924866318702698, "learning_rate": 2.9137534905226272e-05, "loss": 0.7734, "num_input_tokens_seen": 19452400, "step": 33730 }, { "epoch": 5.024575513851653, "grad_norm": 0.2033226490020752, "learning_rate": 2.9131125544520095e-05, "loss": 0.7929, "num_input_tokens_seen": 19455120, "step": 33735 }, { "epoch": 5.025320226392613, "grad_norm": 0.2273845374584198, "learning_rate": 2.9124715904649247e-05, "loss": 0.81, "num_input_tokens_seen": 19458128, "step": 33740 }, { "epoch": 5.026064938933572, "grad_norm": 0.18432766199111938, "learning_rate": 2.911830598604687e-05, "loss": 0.8469, "num_input_tokens_seen": 19460944, "step": 33745 }, { "epoch": 5.02680965147453, "grad_norm": 0.16542194783687592, "learning_rate": 2.911189578914611e-05, "loss": 0.7977, "num_input_tokens_seen": 19463504, "step": 33750 }, { "epoch": 5.02755436401549, "grad_norm": 0.20139677822589874, "learning_rate": 2.9105485314380154e-05, "loss": 0.8034, "num_input_tokens_seen": 19466512, "step": 33755 }, { "epoch": 5.028299076556449, "grad_norm": 0.1742122918367386, "learning_rate": 2.90990745621822e-05, "loss": 0.8055, "num_input_tokens_seen": 19469168, "step": 33760 }, { "epoch": 5.0290437890974085, "grad_norm": 0.21908102929592133, "learning_rate": 2.9092663532985442e-05, "loss": 0.831, "num_input_tokens_seen": 19472080, "step": 33765 }, { "epoch": 5.029788501638367, "grad_norm": 0.3066713213920593, "learning_rate": 2.9086252227223122e-05, "loss": 0.7754, "num_input_tokens_seen": 19474832, "step": 33770 }, { "epoch": 5.030533214179327, "grad_norm": 0.20616155862808228, "learning_rate": 2.9079840645328505e-05, "loss": 0.768, "num_input_tokens_seen": 19477584, "step": 33775 }, { "epoch": 5.031277926720286, "grad_norm": 0.22166074812412262, "learning_rate": 2.907342878773483e-05, "loss": 0.7998, "num_input_tokens_seen": 19480304, "step": 33780 }, { "epoch": 5.032022639261245, "grad_norm": 0.21879997849464417, "learning_rate": 2.90670166548754e-05, "loss": 0.7961, "num_input_tokens_seen": 19483248, "step": 33785 }, { "epoch": 5.032767351802204, "grad_norm": 0.265679270029068, "learning_rate": 2.9060604247183525e-05, "loss": 0.816, "num_input_tokens_seen": 19486192, "step": 33790 }, { "epoch": 5.033512064343164, "grad_norm": 0.2337900549173355, "learning_rate": 2.9054191565092524e-05, "loss": 0.7735, "num_input_tokens_seen": 19489072, "step": 33795 }, { "epoch": 5.034256776884122, "grad_norm": 0.20133669674396515, "learning_rate": 2.9047778609035737e-05, "loss": 0.7692, "num_input_tokens_seen": 19492112, "step": 33800 }, { "epoch": 5.035001489425082, "grad_norm": 0.20637767016887665, "learning_rate": 2.9041365379446522e-05, "loss": 0.8215, "num_input_tokens_seen": 19495280, "step": 33805 }, { "epoch": 5.035746201966041, "grad_norm": 0.2630608081817627, "learning_rate": 2.9034951876758276e-05, "loss": 0.8137, "num_input_tokens_seen": 19497872, "step": 33810 }, { "epoch": 5.0364909145070005, "grad_norm": 0.1994534730911255, "learning_rate": 2.902853810140439e-05, "loss": 0.8015, "num_input_tokens_seen": 19500848, "step": 33815 }, { "epoch": 5.037235627047959, "grad_norm": 0.18806464970111847, "learning_rate": 2.9022124053818268e-05, "loss": 0.8067, "num_input_tokens_seen": 19503728, "step": 33820 }, { "epoch": 5.037980339588919, "grad_norm": 0.24750082194805145, "learning_rate": 2.901570973443336e-05, "loss": 0.8318, "num_input_tokens_seen": 19506640, "step": 33825 }, { "epoch": 5.038725052129878, "grad_norm": 0.21560105681419373, "learning_rate": 2.9009295143683114e-05, "loss": 0.7993, "num_input_tokens_seen": 19509456, "step": 33830 }, { "epoch": 5.039469764670837, "grad_norm": 0.20854471623897552, "learning_rate": 2.9002880282001004e-05, "loss": 0.8227, "num_input_tokens_seen": 19512400, "step": 33835 }, { "epoch": 5.040214477211796, "grad_norm": 0.2240348905324936, "learning_rate": 2.899646514982052e-05, "loss": 0.7746, "num_input_tokens_seen": 19515472, "step": 33840 }, { "epoch": 5.040959189752756, "grad_norm": 0.19358207285404205, "learning_rate": 2.8990049747575165e-05, "loss": 0.7911, "num_input_tokens_seen": 19518256, "step": 33845 }, { "epoch": 5.0417039022937145, "grad_norm": 0.2384873330593109, "learning_rate": 2.8983634075698475e-05, "loss": 0.8374, "num_input_tokens_seen": 19520848, "step": 33850 }, { "epoch": 5.042448614834674, "grad_norm": 0.17531414330005646, "learning_rate": 2.897721813462399e-05, "loss": 0.8073, "num_input_tokens_seen": 19523728, "step": 33855 }, { "epoch": 5.043193327375633, "grad_norm": 0.2400289922952652, "learning_rate": 2.8970801924785273e-05, "loss": 0.8119, "num_input_tokens_seen": 19526736, "step": 33860 }, { "epoch": 5.0439380399165925, "grad_norm": 0.18657012283802032, "learning_rate": 2.8964385446615905e-05, "loss": 0.821, "num_input_tokens_seen": 19529744, "step": 33865 }, { "epoch": 5.044682752457551, "grad_norm": 0.21708042919635773, "learning_rate": 2.895796870054948e-05, "loss": 0.8106, "num_input_tokens_seen": 19532624, "step": 33870 }, { "epoch": 5.045427464998511, "grad_norm": 0.2350788116455078, "learning_rate": 2.895155168701964e-05, "loss": 0.8089, "num_input_tokens_seen": 19535504, "step": 33875 }, { "epoch": 5.04617217753947, "grad_norm": 0.2008868306875229, "learning_rate": 2.894513440645999e-05, "loss": 0.8029, "num_input_tokens_seen": 19538320, "step": 33880 }, { "epoch": 5.046916890080429, "grad_norm": 0.14978985488414764, "learning_rate": 2.8938716859304193e-05, "loss": 0.7746, "num_input_tokens_seen": 19540912, "step": 33885 }, { "epoch": 5.047661602621388, "grad_norm": 0.23464341461658478, "learning_rate": 2.8932299045985932e-05, "loss": 0.8039, "num_input_tokens_seen": 19543664, "step": 33890 }, { "epoch": 5.048406315162348, "grad_norm": 0.2630387246608734, "learning_rate": 2.892588096693889e-05, "loss": 0.816, "num_input_tokens_seen": 19546384, "step": 33895 }, { "epoch": 5.0491510277033065, "grad_norm": 0.20174016058444977, "learning_rate": 2.8919462622596764e-05, "loss": 0.8006, "num_input_tokens_seen": 19549264, "step": 33900 }, { "epoch": 5.049895740244266, "grad_norm": 0.2335624098777771, "learning_rate": 2.8913044013393305e-05, "loss": 0.8361, "num_input_tokens_seen": 19552176, "step": 33905 }, { "epoch": 5.050640452785225, "grad_norm": 0.2185317724943161, "learning_rate": 2.890662513976223e-05, "loss": 0.7995, "num_input_tokens_seen": 19555024, "step": 33910 }, { "epoch": 5.0513851653261845, "grad_norm": 0.20564495027065277, "learning_rate": 2.890020600213731e-05, "loss": 0.8129, "num_input_tokens_seen": 19557936, "step": 33915 }, { "epoch": 5.052129877867143, "grad_norm": 0.19732442498207092, "learning_rate": 2.889378660095233e-05, "loss": 0.833, "num_input_tokens_seen": 19561040, "step": 33920 }, { "epoch": 5.052874590408102, "grad_norm": 0.3092939853668213, "learning_rate": 2.8887366936641082e-05, "loss": 0.8009, "num_input_tokens_seen": 19564208, "step": 33925 }, { "epoch": 5.053619302949062, "grad_norm": 0.2904396653175354, "learning_rate": 2.8880947009637377e-05, "loss": 0.7704, "num_input_tokens_seen": 19567184, "step": 33930 }, { "epoch": 5.0543640154900205, "grad_norm": 0.17109724879264832, "learning_rate": 2.887452682037506e-05, "loss": 0.8171, "num_input_tokens_seen": 19569968, "step": 33935 }, { "epoch": 5.05510872803098, "grad_norm": 0.18357035517692566, "learning_rate": 2.8868106369287966e-05, "loss": 0.7819, "num_input_tokens_seen": 19572912, "step": 33940 }, { "epoch": 5.055853440571939, "grad_norm": 0.25868648290634155, "learning_rate": 2.886168565680997e-05, "loss": 0.8324, "num_input_tokens_seen": 19575856, "step": 33945 }, { "epoch": 5.0565981531128985, "grad_norm": 0.21232843399047852, "learning_rate": 2.8855264683374956e-05, "loss": 0.7856, "num_input_tokens_seen": 19579216, "step": 33950 }, { "epoch": 5.057342865653857, "grad_norm": 0.2225022315979004, "learning_rate": 2.884884344941684e-05, "loss": 0.7826, "num_input_tokens_seen": 19582096, "step": 33955 }, { "epoch": 5.058087578194817, "grad_norm": 0.23898983001708984, "learning_rate": 2.8842421955369526e-05, "loss": 0.8216, "num_input_tokens_seen": 19585104, "step": 33960 }, { "epoch": 5.058832290735776, "grad_norm": 0.21483206748962402, "learning_rate": 2.883600020166695e-05, "loss": 0.8132, "num_input_tokens_seen": 19588144, "step": 33965 }, { "epoch": 5.059577003276735, "grad_norm": 0.21512769162654877, "learning_rate": 2.8829578188743084e-05, "loss": 0.778, "num_input_tokens_seen": 19590928, "step": 33970 }, { "epoch": 5.060321715817694, "grad_norm": 0.20295949280261993, "learning_rate": 2.88231559170319e-05, "loss": 0.7896, "num_input_tokens_seen": 19593968, "step": 33975 }, { "epoch": 5.061066428358654, "grad_norm": 0.3014381229877472, "learning_rate": 2.8816733386967376e-05, "loss": 0.8248, "num_input_tokens_seen": 19596880, "step": 33980 }, { "epoch": 5.0618111408996125, "grad_norm": 0.33806493878364563, "learning_rate": 2.8810310598983524e-05, "loss": 0.8597, "num_input_tokens_seen": 19599664, "step": 33985 }, { "epoch": 5.062555853440572, "grad_norm": 0.19965942203998566, "learning_rate": 2.880388755351438e-05, "loss": 0.8073, "num_input_tokens_seen": 19602384, "step": 33990 }, { "epoch": 5.063300565981531, "grad_norm": 0.18187765777111053, "learning_rate": 2.8797464250993984e-05, "loss": 0.7839, "num_input_tokens_seen": 19605168, "step": 33995 }, { "epoch": 5.0640452785224905, "grad_norm": 0.33941760659217834, "learning_rate": 2.8791040691856385e-05, "loss": 0.7864, "num_input_tokens_seen": 19608176, "step": 34000 }, { "epoch": 5.064789991063449, "grad_norm": 0.20719735324382782, "learning_rate": 2.8784616876535673e-05, "loss": 0.7938, "num_input_tokens_seen": 19611472, "step": 34005 }, { "epoch": 5.065534703604409, "grad_norm": 0.2522352337837219, "learning_rate": 2.8778192805465937e-05, "loss": 0.768, "num_input_tokens_seen": 19614384, "step": 34010 }, { "epoch": 5.066279416145368, "grad_norm": 0.2074739634990692, "learning_rate": 2.8771768479081297e-05, "loss": 0.8242, "num_input_tokens_seen": 19617104, "step": 34015 }, { "epoch": 5.067024128686327, "grad_norm": 0.2749664783477783, "learning_rate": 2.8765343897815867e-05, "loss": 0.839, "num_input_tokens_seen": 19619792, "step": 34020 }, { "epoch": 5.067768841227286, "grad_norm": 0.20572781562805176, "learning_rate": 2.8758919062103817e-05, "loss": 0.7983, "num_input_tokens_seen": 19622448, "step": 34025 }, { "epoch": 5.068513553768246, "grad_norm": 0.20543545484542847, "learning_rate": 2.8752493972379292e-05, "loss": 0.8196, "num_input_tokens_seen": 19625200, "step": 34030 }, { "epoch": 5.0692582663092045, "grad_norm": 0.15826387703418732, "learning_rate": 2.8746068629076487e-05, "loss": 0.7893, "num_input_tokens_seen": 19627920, "step": 34035 }, { "epoch": 5.070002978850164, "grad_norm": 0.34705665707588196, "learning_rate": 2.8739643032629592e-05, "loss": 0.8334, "num_input_tokens_seen": 19630928, "step": 34040 }, { "epoch": 5.070747691391123, "grad_norm": 0.2613135576248169, "learning_rate": 2.8733217183472823e-05, "loss": 0.821, "num_input_tokens_seen": 19633744, "step": 34045 }, { "epoch": 5.071492403932083, "grad_norm": 0.2793966233730316, "learning_rate": 2.8726791082040416e-05, "loss": 0.7901, "num_input_tokens_seen": 19636816, "step": 34050 }, { "epoch": 5.072237116473041, "grad_norm": 0.1589083969593048, "learning_rate": 2.8720364728766618e-05, "loss": 0.8058, "num_input_tokens_seen": 19639536, "step": 34055 }, { "epoch": 5.072981829014001, "grad_norm": 0.22134874761104584, "learning_rate": 2.8713938124085706e-05, "loss": 0.806, "num_input_tokens_seen": 19642256, "step": 34060 }, { "epoch": 5.07372654155496, "grad_norm": 0.22076961398124695, "learning_rate": 2.8707511268431947e-05, "loss": 0.8016, "num_input_tokens_seen": 19645168, "step": 34065 }, { "epoch": 5.074471254095919, "grad_norm": 0.2563554644584656, "learning_rate": 2.8701084162239656e-05, "loss": 0.817, "num_input_tokens_seen": 19647920, "step": 34070 }, { "epoch": 5.075215966636878, "grad_norm": 0.22137261927127838, "learning_rate": 2.8694656805943143e-05, "loss": 0.7999, "num_input_tokens_seen": 19650480, "step": 34075 }, { "epoch": 5.075960679177838, "grad_norm": 0.18410877883434296, "learning_rate": 2.868822919997674e-05, "loss": 0.7937, "num_input_tokens_seen": 19653488, "step": 34080 }, { "epoch": 5.0767053917187965, "grad_norm": 0.19777938723564148, "learning_rate": 2.86818013447748e-05, "loss": 0.7976, "num_input_tokens_seen": 19656272, "step": 34085 }, { "epoch": 5.077450104259755, "grad_norm": 0.19556213915348053, "learning_rate": 2.8675373240771703e-05, "loss": 0.7783, "num_input_tokens_seen": 19658960, "step": 34090 }, { "epoch": 5.078194816800715, "grad_norm": 0.261955201625824, "learning_rate": 2.8668944888401826e-05, "loss": 0.7758, "num_input_tokens_seen": 19661680, "step": 34095 }, { "epoch": 5.078939529341674, "grad_norm": 0.24059687554836273, "learning_rate": 2.866251628809956e-05, "loss": 0.8088, "num_input_tokens_seen": 19664528, "step": 34100 }, { "epoch": 5.079684241882633, "grad_norm": 0.295947790145874, "learning_rate": 2.8656087440299347e-05, "loss": 0.8081, "num_input_tokens_seen": 19667344, "step": 34105 }, { "epoch": 5.080428954423592, "grad_norm": 0.19413891434669495, "learning_rate": 2.8649658345435597e-05, "loss": 0.8514, "num_input_tokens_seen": 19670224, "step": 34110 }, { "epoch": 5.081173666964552, "grad_norm": 0.2044314295053482, "learning_rate": 2.8643229003942786e-05, "loss": 0.7739, "num_input_tokens_seen": 19673168, "step": 34115 }, { "epoch": 5.0819183795055105, "grad_norm": 0.1808026134967804, "learning_rate": 2.8636799416255362e-05, "loss": 0.796, "num_input_tokens_seen": 19675856, "step": 34120 }, { "epoch": 5.08266309204647, "grad_norm": 0.25036871433258057, "learning_rate": 2.8630369582807824e-05, "loss": 0.82, "num_input_tokens_seen": 19678512, "step": 34125 }, { "epoch": 5.083407804587429, "grad_norm": 0.24698005616664886, "learning_rate": 2.8623939504034662e-05, "loss": 0.8196, "num_input_tokens_seen": 19681360, "step": 34130 }, { "epoch": 5.084152517128389, "grad_norm": 0.17702296376228333, "learning_rate": 2.8617509180370418e-05, "loss": 0.79, "num_input_tokens_seen": 19684272, "step": 34135 }, { "epoch": 5.084897229669347, "grad_norm": 0.21178333461284637, "learning_rate": 2.8611078612249598e-05, "loss": 0.7843, "num_input_tokens_seen": 19686928, "step": 34140 }, { "epoch": 5.085641942210307, "grad_norm": 0.18655624985694885, "learning_rate": 2.8604647800106772e-05, "loss": 0.7882, "num_input_tokens_seen": 19689776, "step": 34145 }, { "epoch": 5.086386654751266, "grad_norm": 0.23197294771671295, "learning_rate": 2.85982167443765e-05, "loss": 0.8595, "num_input_tokens_seen": 19692496, "step": 34150 }, { "epoch": 5.087131367292225, "grad_norm": 0.24440690875053406, "learning_rate": 2.8591785445493376e-05, "loss": 0.7965, "num_input_tokens_seen": 19695472, "step": 34155 }, { "epoch": 5.087876079833184, "grad_norm": 0.25634992122650146, "learning_rate": 2.8585353903891986e-05, "loss": 0.8008, "num_input_tokens_seen": 19698416, "step": 34160 }, { "epoch": 5.088620792374144, "grad_norm": 0.25643694400787354, "learning_rate": 2.8578922120006962e-05, "loss": 0.8003, "num_input_tokens_seen": 19701424, "step": 34165 }, { "epoch": 5.0893655049151025, "grad_norm": 0.2494710385799408, "learning_rate": 2.857249009427293e-05, "loss": 0.8053, "num_input_tokens_seen": 19704432, "step": 34170 }, { "epoch": 5.090110217456062, "grad_norm": 0.1969875544309616, "learning_rate": 2.856605782712455e-05, "loss": 0.7961, "num_input_tokens_seen": 19707216, "step": 34175 }, { "epoch": 5.090854929997021, "grad_norm": 0.22906352579593658, "learning_rate": 2.855962531899647e-05, "loss": 0.8209, "num_input_tokens_seen": 19710032, "step": 34180 }, { "epoch": 5.091599642537981, "grad_norm": 0.24664902687072754, "learning_rate": 2.8553192570323385e-05, "loss": 0.8045, "num_input_tokens_seen": 19712912, "step": 34185 }, { "epoch": 5.092344355078939, "grad_norm": 0.24626323580741882, "learning_rate": 2.8546759581539994e-05, "loss": 0.806, "num_input_tokens_seen": 19716176, "step": 34190 }, { "epoch": 5.093089067619899, "grad_norm": 0.21729633212089539, "learning_rate": 2.8540326353081005e-05, "loss": 0.8133, "num_input_tokens_seen": 19719120, "step": 34195 }, { "epoch": 5.093833780160858, "grad_norm": 0.2624443769454956, "learning_rate": 2.8533892885381164e-05, "loss": 0.8007, "num_input_tokens_seen": 19721904, "step": 34200 }, { "epoch": 5.094578492701817, "grad_norm": 0.30383211374282837, "learning_rate": 2.852745917887521e-05, "loss": 0.8092, "num_input_tokens_seen": 19724720, "step": 34205 }, { "epoch": 5.095323205242776, "grad_norm": 0.2001200020313263, "learning_rate": 2.85210252339979e-05, "loss": 0.7798, "num_input_tokens_seen": 19727568, "step": 34210 }, { "epoch": 5.096067917783736, "grad_norm": 0.2205793857574463, "learning_rate": 2.851459105118402e-05, "loss": 0.795, "num_input_tokens_seen": 19730704, "step": 34215 }, { "epoch": 5.096812630324695, "grad_norm": 0.1530483067035675, "learning_rate": 2.8508156630868373e-05, "loss": 0.7945, "num_input_tokens_seen": 19733584, "step": 34220 }, { "epoch": 5.097557342865654, "grad_norm": 0.21520057320594788, "learning_rate": 2.8501721973485757e-05, "loss": 0.7809, "num_input_tokens_seen": 19736304, "step": 34225 }, { "epoch": 5.098302055406613, "grad_norm": 0.266570121049881, "learning_rate": 2.8495287079471012e-05, "loss": 0.7875, "num_input_tokens_seen": 19739440, "step": 34230 }, { "epoch": 5.099046767947573, "grad_norm": 0.20099513232707977, "learning_rate": 2.8488851949258972e-05, "loss": 0.793, "num_input_tokens_seen": 19742448, "step": 34235 }, { "epoch": 5.099791480488531, "grad_norm": 0.21109086275100708, "learning_rate": 2.848241658328451e-05, "loss": 0.7787, "num_input_tokens_seen": 19745200, "step": 34240 }, { "epoch": 5.100536193029491, "grad_norm": 0.292236328125, "learning_rate": 2.8475980981982485e-05, "loss": 0.8108, "num_input_tokens_seen": 19748528, "step": 34245 }, { "epoch": 5.10128090557045, "grad_norm": 0.23606431484222412, "learning_rate": 2.8469545145787796e-05, "loss": 0.8178, "num_input_tokens_seen": 19751248, "step": 34250 }, { "epoch": 5.102025618111409, "grad_norm": 0.210796058177948, "learning_rate": 2.846310907513536e-05, "loss": 0.8078, "num_input_tokens_seen": 19753936, "step": 34255 }, { "epoch": 5.102770330652368, "grad_norm": 0.2865477502346039, "learning_rate": 2.845667277046009e-05, "loss": 0.8165, "num_input_tokens_seen": 19756880, "step": 34260 }, { "epoch": 5.103515043193327, "grad_norm": 0.1709514707326889, "learning_rate": 2.8450236232196924e-05, "loss": 0.8296, "num_input_tokens_seen": 19759792, "step": 34265 }, { "epoch": 5.104259755734287, "grad_norm": 0.17308902740478516, "learning_rate": 2.844379946078083e-05, "loss": 0.8043, "num_input_tokens_seen": 19762896, "step": 34270 }, { "epoch": 5.105004468275245, "grad_norm": 0.22266393899917603, "learning_rate": 2.843736245664676e-05, "loss": 0.7942, "num_input_tokens_seen": 19765776, "step": 34275 }, { "epoch": 5.105749180816205, "grad_norm": 0.25160855054855347, "learning_rate": 2.843092522022972e-05, "loss": 0.8004, "num_input_tokens_seen": 19768592, "step": 34280 }, { "epoch": 5.106493893357164, "grad_norm": 0.23210972547531128, "learning_rate": 2.8424487751964696e-05, "loss": 0.8076, "num_input_tokens_seen": 19771760, "step": 34285 }, { "epoch": 5.107238605898123, "grad_norm": 0.16059710085391998, "learning_rate": 2.8418050052286715e-05, "loss": 0.8048, "num_input_tokens_seen": 19774480, "step": 34290 }, { "epoch": 5.107983318439082, "grad_norm": 0.23872722685337067, "learning_rate": 2.8411612121630804e-05, "loss": 0.804, "num_input_tokens_seen": 19777232, "step": 34295 }, { "epoch": 5.108728030980042, "grad_norm": 0.24763457477092743, "learning_rate": 2.8405173960432024e-05, "loss": 0.8184, "num_input_tokens_seen": 19779888, "step": 34300 }, { "epoch": 5.109472743521001, "grad_norm": 0.19663801789283752, "learning_rate": 2.8398735569125427e-05, "loss": 0.8003, "num_input_tokens_seen": 19782736, "step": 34305 }, { "epoch": 5.11021745606196, "grad_norm": 0.21423789858818054, "learning_rate": 2.83922969481461e-05, "loss": 0.8095, "num_input_tokens_seen": 19785552, "step": 34310 }, { "epoch": 5.110962168602919, "grad_norm": 0.25372928380966187, "learning_rate": 2.8385858097929135e-05, "loss": 0.7994, "num_input_tokens_seen": 19788336, "step": 34315 }, { "epoch": 5.111706881143879, "grad_norm": 0.2239609807729721, "learning_rate": 2.8379419018909648e-05, "loss": 0.828, "num_input_tokens_seen": 19791152, "step": 34320 }, { "epoch": 5.112451593684837, "grad_norm": 0.2452956885099411, "learning_rate": 2.8372979711522767e-05, "loss": 0.791, "num_input_tokens_seen": 19793776, "step": 34325 }, { "epoch": 5.113196306225797, "grad_norm": 0.1883571296930313, "learning_rate": 2.8366540176203625e-05, "loss": 0.7943, "num_input_tokens_seen": 19796400, "step": 34330 }, { "epoch": 5.113941018766756, "grad_norm": 0.21650512516498566, "learning_rate": 2.8360100413387392e-05, "loss": 0.8054, "num_input_tokens_seen": 19799088, "step": 34335 }, { "epoch": 5.114685731307715, "grad_norm": 0.1711876541376114, "learning_rate": 2.8353660423509233e-05, "loss": 0.8061, "num_input_tokens_seen": 19801872, "step": 34340 }, { "epoch": 5.115430443848674, "grad_norm": 0.2556573748588562, "learning_rate": 2.8347220207004325e-05, "loss": 0.7829, "num_input_tokens_seen": 19804848, "step": 34345 }, { "epoch": 5.116175156389634, "grad_norm": 0.16999244689941406, "learning_rate": 2.834077976430789e-05, "loss": 0.7939, "num_input_tokens_seen": 19807696, "step": 34350 }, { "epoch": 5.116919868930593, "grad_norm": 0.21163037419319153, "learning_rate": 2.8334339095855152e-05, "loss": 0.8097, "num_input_tokens_seen": 19810544, "step": 34355 }, { "epoch": 5.117664581471552, "grad_norm": 0.19661085307598114, "learning_rate": 2.8327898202081327e-05, "loss": 0.7967, "num_input_tokens_seen": 19813200, "step": 34360 }, { "epoch": 5.118409294012511, "grad_norm": 0.22026576101779938, "learning_rate": 2.8321457083421665e-05, "loss": 0.791, "num_input_tokens_seen": 19815888, "step": 34365 }, { "epoch": 5.119154006553471, "grad_norm": 0.20402850210666656, "learning_rate": 2.831501574031145e-05, "loss": 0.7832, "num_input_tokens_seen": 19818768, "step": 34370 }, { "epoch": 5.119898719094429, "grad_norm": 0.2761077880859375, "learning_rate": 2.8308574173185943e-05, "loss": 0.7999, "num_input_tokens_seen": 19822000, "step": 34375 }, { "epoch": 5.120643431635389, "grad_norm": 0.18704555928707123, "learning_rate": 2.8302132382480447e-05, "loss": 0.7766, "num_input_tokens_seen": 19824976, "step": 34380 }, { "epoch": 5.121388144176348, "grad_norm": 0.3350652754306793, "learning_rate": 2.8295690368630263e-05, "loss": 0.797, "num_input_tokens_seen": 19827952, "step": 34385 }, { "epoch": 5.1221328567173074, "grad_norm": 0.21274249255657196, "learning_rate": 2.8289248132070727e-05, "loss": 0.7935, "num_input_tokens_seen": 19830704, "step": 34390 }, { "epoch": 5.122877569258266, "grad_norm": 0.25078269839286804, "learning_rate": 2.828280567323718e-05, "loss": 0.8279, "num_input_tokens_seen": 19833616, "step": 34395 }, { "epoch": 5.123622281799226, "grad_norm": 0.2441975325345993, "learning_rate": 2.827636299256497e-05, "loss": 0.7814, "num_input_tokens_seen": 19836432, "step": 34400 }, { "epoch": 5.124366994340185, "grad_norm": 0.2475612759590149, "learning_rate": 2.826992009048947e-05, "loss": 0.8343, "num_input_tokens_seen": 19838992, "step": 34405 }, { "epoch": 5.125111706881144, "grad_norm": 0.14482688903808594, "learning_rate": 2.8263476967446062e-05, "loss": 0.7726, "num_input_tokens_seen": 19841840, "step": 34410 }, { "epoch": 5.125856419422103, "grad_norm": 0.20311735570430756, "learning_rate": 2.8257033623870145e-05, "loss": 0.7852, "num_input_tokens_seen": 19844912, "step": 34415 }, { "epoch": 5.126601131963063, "grad_norm": 0.20928817987442017, "learning_rate": 2.825059006019715e-05, "loss": 0.7999, "num_input_tokens_seen": 19847824, "step": 34420 }, { "epoch": 5.127345844504021, "grad_norm": 0.15869364142417908, "learning_rate": 2.824414627686249e-05, "loss": 0.7808, "num_input_tokens_seen": 19850704, "step": 34425 }, { "epoch": 5.128090557044981, "grad_norm": 0.17575566470623016, "learning_rate": 2.8237702274301602e-05, "loss": 0.8499, "num_input_tokens_seen": 19853424, "step": 34430 }, { "epoch": 5.12883526958594, "grad_norm": 0.22092296183109283, "learning_rate": 2.823125805294997e-05, "loss": 0.7807, "num_input_tokens_seen": 19856048, "step": 34435 }, { "epoch": 5.129579982126899, "grad_norm": 0.2749479413032532, "learning_rate": 2.8224813613243062e-05, "loss": 0.7925, "num_input_tokens_seen": 19859248, "step": 34440 }, { "epoch": 5.130324694667858, "grad_norm": 0.2055119425058365, "learning_rate": 2.8218368955616347e-05, "loss": 0.8253, "num_input_tokens_seen": 19862128, "step": 34445 }, { "epoch": 5.131069407208817, "grad_norm": 0.2507551610469818, "learning_rate": 2.8211924080505348e-05, "loss": 0.8022, "num_input_tokens_seen": 19864720, "step": 34450 }, { "epoch": 5.131814119749777, "grad_norm": 0.21364964544773102, "learning_rate": 2.8205478988345584e-05, "loss": 0.8308, "num_input_tokens_seen": 19867376, "step": 34455 }, { "epoch": 5.132558832290735, "grad_norm": 0.24021221697330475, "learning_rate": 2.8199033679572578e-05, "loss": 0.8434, "num_input_tokens_seen": 19870192, "step": 34460 }, { "epoch": 5.133303544831695, "grad_norm": 0.24954961240291595, "learning_rate": 2.819258815462188e-05, "loss": 0.8058, "num_input_tokens_seen": 19872976, "step": 34465 }, { "epoch": 5.134048257372654, "grad_norm": 0.26692384481430054, "learning_rate": 2.8186142413929063e-05, "loss": 0.7887, "num_input_tokens_seen": 19875664, "step": 34470 }, { "epoch": 5.1347929699136134, "grad_norm": 0.19616222381591797, "learning_rate": 2.8179696457929684e-05, "loss": 0.8115, "num_input_tokens_seen": 19878448, "step": 34475 }, { "epoch": 5.135537682454572, "grad_norm": 0.19619989395141602, "learning_rate": 2.8173250287059354e-05, "loss": 0.7965, "num_input_tokens_seen": 19881296, "step": 34480 }, { "epoch": 5.136282394995532, "grad_norm": 0.26763588190078735, "learning_rate": 2.816680390175367e-05, "loss": 0.784, "num_input_tokens_seen": 19883632, "step": 34485 }, { "epoch": 5.137027107536491, "grad_norm": 0.19899354875087738, "learning_rate": 2.8160357302448253e-05, "loss": 0.7823, "num_input_tokens_seen": 19886640, "step": 34490 }, { "epoch": 5.13777182007745, "grad_norm": 0.17109963297843933, "learning_rate": 2.8153910489578734e-05, "loss": 0.8068, "num_input_tokens_seen": 19889232, "step": 34495 }, { "epoch": 5.138516532618409, "grad_norm": 0.2616936266422272, "learning_rate": 2.8147463463580776e-05, "loss": 0.7945, "num_input_tokens_seen": 19892144, "step": 34500 }, { "epoch": 5.139261245159369, "grad_norm": 0.17462459206581116, "learning_rate": 2.8141016224890027e-05, "loss": 0.7928, "num_input_tokens_seen": 19895120, "step": 34505 }, { "epoch": 5.140005957700327, "grad_norm": 0.20417535305023193, "learning_rate": 2.8134568773942172e-05, "loss": 0.806, "num_input_tokens_seen": 19897872, "step": 34510 }, { "epoch": 5.140750670241287, "grad_norm": 0.1996048390865326, "learning_rate": 2.81281211111729e-05, "loss": 0.8219, "num_input_tokens_seen": 19900784, "step": 34515 }, { "epoch": 5.141495382782246, "grad_norm": 0.22104379534721375, "learning_rate": 2.8121673237017937e-05, "loss": 0.8035, "num_input_tokens_seen": 19903440, "step": 34520 }, { "epoch": 5.1422400953232055, "grad_norm": 0.2494926005601883, "learning_rate": 2.8115225151912977e-05, "loss": 0.7871, "num_input_tokens_seen": 19906544, "step": 34525 }, { "epoch": 5.142984807864164, "grad_norm": 0.17351819574832916, "learning_rate": 2.810877685629376e-05, "loss": 0.7918, "num_input_tokens_seen": 19909232, "step": 34530 }, { "epoch": 5.143729520405124, "grad_norm": 0.21498878300189972, "learning_rate": 2.8102328350596058e-05, "loss": 0.7946, "num_input_tokens_seen": 19912304, "step": 34535 }, { "epoch": 5.144474232946083, "grad_norm": 0.23359419405460358, "learning_rate": 2.809587963525561e-05, "loss": 0.8152, "num_input_tokens_seen": 19915344, "step": 34540 }, { "epoch": 5.145218945487042, "grad_norm": 0.16905967891216278, "learning_rate": 2.8089430710708203e-05, "loss": 0.7935, "num_input_tokens_seen": 19918256, "step": 34545 }, { "epoch": 5.145963658028001, "grad_norm": 0.2278200089931488, "learning_rate": 2.8082981577389627e-05, "loss": 0.8136, "num_input_tokens_seen": 19921008, "step": 34550 }, { "epoch": 5.146708370568961, "grad_norm": 0.20247666537761688, "learning_rate": 2.80765322357357e-05, "loss": 0.8275, "num_input_tokens_seen": 19924304, "step": 34555 }, { "epoch": 5.1474530831099194, "grad_norm": 0.20265252888202667, "learning_rate": 2.8070082686182232e-05, "loss": 0.8198, "num_input_tokens_seen": 19927216, "step": 34560 }, { "epoch": 5.148197795650879, "grad_norm": 0.21909379959106445, "learning_rate": 2.8063632929165047e-05, "loss": 0.808, "num_input_tokens_seen": 19929872, "step": 34565 }, { "epoch": 5.148942508191838, "grad_norm": 0.25080999732017517, "learning_rate": 2.805718296512001e-05, "loss": 0.8181, "num_input_tokens_seen": 19932880, "step": 34570 }, { "epoch": 5.1496872207327975, "grad_norm": 0.20374436676502228, "learning_rate": 2.8050732794482976e-05, "loss": 0.8156, "num_input_tokens_seen": 19935920, "step": 34575 }, { "epoch": 5.150431933273756, "grad_norm": 0.18543203175067902, "learning_rate": 2.804428241768983e-05, "loss": 0.8173, "num_input_tokens_seen": 19938640, "step": 34580 }, { "epoch": 5.151176645814716, "grad_norm": 0.23365269601345062, "learning_rate": 2.8037831835176454e-05, "loss": 0.7911, "num_input_tokens_seen": 19941712, "step": 34585 }, { "epoch": 5.151921358355675, "grad_norm": 0.21522611379623413, "learning_rate": 2.8031381047378746e-05, "loss": 0.7949, "num_input_tokens_seen": 19944432, "step": 34590 }, { "epoch": 5.152666070896634, "grad_norm": 0.22448624670505524, "learning_rate": 2.8024930054732635e-05, "loss": 0.773, "num_input_tokens_seen": 19947280, "step": 34595 }, { "epoch": 5.153410783437593, "grad_norm": 0.3100428581237793, "learning_rate": 2.8018478857674052e-05, "loss": 0.7965, "num_input_tokens_seen": 19950256, "step": 34600 }, { "epoch": 5.154155495978552, "grad_norm": 0.2972187101840973, "learning_rate": 2.801202745663894e-05, "loss": 0.8141, "num_input_tokens_seen": 19953360, "step": 34605 }, { "epoch": 5.1549002085195115, "grad_norm": 0.17837008833885193, "learning_rate": 2.8005575852063252e-05, "loss": 0.7662, "num_input_tokens_seen": 19955984, "step": 34610 }, { "epoch": 5.15564492106047, "grad_norm": 0.22631052136421204, "learning_rate": 2.7999124044382975e-05, "loss": 0.799, "num_input_tokens_seen": 19958896, "step": 34615 }, { "epoch": 5.15638963360143, "grad_norm": 0.2626420855522156, "learning_rate": 2.7992672034034096e-05, "loss": 0.8231, "num_input_tokens_seen": 19961776, "step": 34620 }, { "epoch": 5.157134346142389, "grad_norm": 0.2046308070421219, "learning_rate": 2.798621982145259e-05, "loss": 0.8008, "num_input_tokens_seen": 19964688, "step": 34625 }, { "epoch": 5.157879058683348, "grad_norm": 0.20043058693408966, "learning_rate": 2.7979767407074496e-05, "loss": 0.7819, "num_input_tokens_seen": 19967472, "step": 34630 }, { "epoch": 5.158623771224307, "grad_norm": 0.1687229722738266, "learning_rate": 2.7973314791335842e-05, "loss": 0.7969, "num_input_tokens_seen": 19970384, "step": 34635 }, { "epoch": 5.159368483765267, "grad_norm": 0.18146638572216034, "learning_rate": 2.796686197467266e-05, "loss": 0.792, "num_input_tokens_seen": 19973648, "step": 34640 }, { "epoch": 5.1601131963062254, "grad_norm": 0.1905648112297058, "learning_rate": 2.7960408957521005e-05, "loss": 0.8101, "num_input_tokens_seen": 19976304, "step": 34645 }, { "epoch": 5.160857908847185, "grad_norm": 0.20139679312705994, "learning_rate": 2.7953955740316944e-05, "loss": 0.8116, "num_input_tokens_seen": 19979248, "step": 34650 }, { "epoch": 5.161602621388144, "grad_norm": 0.19910147786140442, "learning_rate": 2.794750232349658e-05, "loss": 0.7907, "num_input_tokens_seen": 19982160, "step": 34655 }, { "epoch": 5.1623473339291035, "grad_norm": 0.19948355853557587, "learning_rate": 2.7941048707495982e-05, "loss": 0.7702, "num_input_tokens_seen": 19985296, "step": 34660 }, { "epoch": 5.163092046470062, "grad_norm": 0.19816100597381592, "learning_rate": 2.793459489275127e-05, "loss": 0.8313, "num_input_tokens_seen": 19988112, "step": 34665 }, { "epoch": 5.163836759011022, "grad_norm": 0.17632804811000824, "learning_rate": 2.7928140879698567e-05, "loss": 0.799, "num_input_tokens_seen": 19991152, "step": 34670 }, { "epoch": 5.164581471551981, "grad_norm": 0.2620483636856079, "learning_rate": 2.7921686668774005e-05, "loss": 0.8451, "num_input_tokens_seen": 19994192, "step": 34675 }, { "epoch": 5.16532618409294, "grad_norm": 0.18750818073749542, "learning_rate": 2.791523226041374e-05, "loss": 0.7995, "num_input_tokens_seen": 19996880, "step": 34680 }, { "epoch": 5.166070896633899, "grad_norm": 0.2082844227552414, "learning_rate": 2.7908777655053936e-05, "loss": 0.8051, "num_input_tokens_seen": 19999760, "step": 34685 }, { "epoch": 5.166815609174859, "grad_norm": 0.33073946833610535, "learning_rate": 2.7902322853130757e-05, "loss": 0.8155, "num_input_tokens_seen": 20002704, "step": 34690 }, { "epoch": 5.1675603217158175, "grad_norm": 0.20844388008117676, "learning_rate": 2.7895867855080405e-05, "loss": 0.8156, "num_input_tokens_seen": 20005616, "step": 34695 }, { "epoch": 5.168305034256777, "grad_norm": 0.24878183007240295, "learning_rate": 2.7889412661339077e-05, "loss": 0.8188, "num_input_tokens_seen": 20008720, "step": 34700 }, { "epoch": 5.169049746797736, "grad_norm": 0.22336827218532562, "learning_rate": 2.7882957272342986e-05, "loss": 0.7906, "num_input_tokens_seen": 20011696, "step": 34705 }, { "epoch": 5.1697944593386955, "grad_norm": 0.1844795048236847, "learning_rate": 2.7876501688528362e-05, "loss": 0.8015, "num_input_tokens_seen": 20014736, "step": 34710 }, { "epoch": 5.170539171879654, "grad_norm": 0.22795864939689636, "learning_rate": 2.7870045910331444e-05, "loss": 0.7971, "num_input_tokens_seen": 20017616, "step": 34715 }, { "epoch": 5.171283884420614, "grad_norm": 0.21731726825237274, "learning_rate": 2.786358993818851e-05, "loss": 0.813, "num_input_tokens_seen": 20020624, "step": 34720 }, { "epoch": 5.172028596961573, "grad_norm": 0.2271028757095337, "learning_rate": 2.7857133772535798e-05, "loss": 0.7905, "num_input_tokens_seen": 20023376, "step": 34725 }, { "epoch": 5.172773309502532, "grad_norm": 0.17704591155052185, "learning_rate": 2.78506774138096e-05, "loss": 0.7983, "num_input_tokens_seen": 20026064, "step": 34730 }, { "epoch": 5.173518022043491, "grad_norm": 0.1865706890821457, "learning_rate": 2.7844220862446218e-05, "loss": 0.7902, "num_input_tokens_seen": 20028720, "step": 34735 }, { "epoch": 5.174262734584451, "grad_norm": 0.2284601926803589, "learning_rate": 2.7837764118881953e-05, "loss": 0.8055, "num_input_tokens_seen": 20031632, "step": 34740 }, { "epoch": 5.1750074471254095, "grad_norm": 0.19752629101276398, "learning_rate": 2.7831307183553122e-05, "loss": 0.8068, "num_input_tokens_seen": 20034096, "step": 34745 }, { "epoch": 5.175752159666369, "grad_norm": 0.22315771877765656, "learning_rate": 2.782485005689607e-05, "loss": 0.808, "num_input_tokens_seen": 20036784, "step": 34750 }, { "epoch": 5.176496872207328, "grad_norm": 0.22717083990573883, "learning_rate": 2.7818392739347127e-05, "loss": 0.7966, "num_input_tokens_seen": 20039664, "step": 34755 }, { "epoch": 5.1772415847482876, "grad_norm": 0.2203519344329834, "learning_rate": 2.781193523134267e-05, "loss": 0.7775, "num_input_tokens_seen": 20042448, "step": 34760 }, { "epoch": 5.177986297289246, "grad_norm": 0.16559553146362305, "learning_rate": 2.780547753331906e-05, "loss": 0.7835, "num_input_tokens_seen": 20045392, "step": 34765 }, { "epoch": 5.178731009830206, "grad_norm": 0.21629920601844788, "learning_rate": 2.7799019645712682e-05, "loss": 0.7958, "num_input_tokens_seen": 20048208, "step": 34770 }, { "epoch": 5.179475722371165, "grad_norm": 0.24740815162658691, "learning_rate": 2.7792561568959934e-05, "loss": 0.7957, "num_input_tokens_seen": 20051344, "step": 34775 }, { "epoch": 5.180220434912124, "grad_norm": 0.18991400301456451, "learning_rate": 2.778610330349723e-05, "loss": 0.8032, "num_input_tokens_seen": 20054128, "step": 34780 }, { "epoch": 5.180965147453083, "grad_norm": 0.24812793731689453, "learning_rate": 2.7779644849761004e-05, "loss": 0.7857, "num_input_tokens_seen": 20056976, "step": 34785 }, { "epoch": 5.181709859994042, "grad_norm": 0.21870502829551697, "learning_rate": 2.777318620818767e-05, "loss": 0.788, "num_input_tokens_seen": 20059824, "step": 34790 }, { "epoch": 5.1824545725350015, "grad_norm": 0.23413459956645966, "learning_rate": 2.7766727379213686e-05, "loss": 0.827, "num_input_tokens_seen": 20062512, "step": 34795 }, { "epoch": 5.18319928507596, "grad_norm": 0.23628413677215576, "learning_rate": 2.7760268363275523e-05, "loss": 0.7926, "num_input_tokens_seen": 20065424, "step": 34800 }, { "epoch": 5.18394399761692, "grad_norm": 0.27871790528297424, "learning_rate": 2.7753809160809642e-05, "loss": 0.7976, "num_input_tokens_seen": 20068496, "step": 34805 }, { "epoch": 5.184688710157879, "grad_norm": 0.24376444518566132, "learning_rate": 2.7747349772252523e-05, "loss": 0.821, "num_input_tokens_seen": 20071376, "step": 34810 }, { "epoch": 5.185433422698838, "grad_norm": 0.28881508111953735, "learning_rate": 2.7740890198040687e-05, "loss": 0.7929, "num_input_tokens_seen": 20074288, "step": 34815 }, { "epoch": 5.186178135239797, "grad_norm": 0.25920024514198303, "learning_rate": 2.7734430438610637e-05, "loss": 0.7926, "num_input_tokens_seen": 20077008, "step": 34820 }, { "epoch": 5.186922847780757, "grad_norm": 0.19919873774051666, "learning_rate": 2.7727970494398892e-05, "loss": 0.7952, "num_input_tokens_seen": 20079664, "step": 34825 }, { "epoch": 5.1876675603217155, "grad_norm": 0.2691554129123688, "learning_rate": 2.772151036584199e-05, "loss": 0.7918, "num_input_tokens_seen": 20082448, "step": 34830 }, { "epoch": 5.188412272862675, "grad_norm": 0.2524610757827759, "learning_rate": 2.7715050053376484e-05, "loss": 0.7763, "num_input_tokens_seen": 20085136, "step": 34835 }, { "epoch": 5.189156985403634, "grad_norm": 0.22229647636413574, "learning_rate": 2.7708589557438936e-05, "loss": 0.8149, "num_input_tokens_seen": 20088080, "step": 34840 }, { "epoch": 5.1899016979445936, "grad_norm": 0.28147855401039124, "learning_rate": 2.7702128878465917e-05, "loss": 0.7907, "num_input_tokens_seen": 20090704, "step": 34845 }, { "epoch": 5.190646410485552, "grad_norm": 0.2371390163898468, "learning_rate": 2.7695668016894017e-05, "loss": 0.7933, "num_input_tokens_seen": 20093776, "step": 34850 }, { "epoch": 5.191391123026512, "grad_norm": 0.2409917265176773, "learning_rate": 2.7689206973159825e-05, "loss": 0.8226, "num_input_tokens_seen": 20096528, "step": 34855 }, { "epoch": 5.192135835567471, "grad_norm": 0.18571367859840393, "learning_rate": 2.7682745747699962e-05, "loss": 0.7964, "num_input_tokens_seen": 20099280, "step": 34860 }, { "epoch": 5.19288054810843, "grad_norm": 0.2684367299079895, "learning_rate": 2.7676284340951054e-05, "loss": 0.8155, "num_input_tokens_seen": 20102064, "step": 34865 }, { "epoch": 5.193625260649389, "grad_norm": 0.22679810225963593, "learning_rate": 2.766982275334973e-05, "loss": 0.7954, "num_input_tokens_seen": 20105296, "step": 34870 }, { "epoch": 5.194369973190349, "grad_norm": 0.24239952862262726, "learning_rate": 2.7663360985332632e-05, "loss": 0.8118, "num_input_tokens_seen": 20108144, "step": 34875 }, { "epoch": 5.1951146857313075, "grad_norm": 0.2095666527748108, "learning_rate": 2.7656899037336426e-05, "loss": 0.7994, "num_input_tokens_seen": 20111088, "step": 34880 }, { "epoch": 5.195859398272267, "grad_norm": 0.2976871132850647, "learning_rate": 2.76504369097978e-05, "loss": 0.7876, "num_input_tokens_seen": 20114480, "step": 34885 }, { "epoch": 5.196604110813226, "grad_norm": 0.19098979234695435, "learning_rate": 2.7643974603153412e-05, "loss": 0.8219, "num_input_tokens_seen": 20117264, "step": 34890 }, { "epoch": 5.197348823354186, "grad_norm": 0.2048966884613037, "learning_rate": 2.763751211783997e-05, "loss": 0.8258, "num_input_tokens_seen": 20120144, "step": 34895 }, { "epoch": 5.198093535895144, "grad_norm": 0.19155511260032654, "learning_rate": 2.7631049454294182e-05, "loss": 0.8361, "num_input_tokens_seen": 20122896, "step": 34900 }, { "epoch": 5.198838248436104, "grad_norm": 0.19470860064029694, "learning_rate": 2.7624586612952775e-05, "loss": 0.8014, "num_input_tokens_seen": 20125968, "step": 34905 }, { "epoch": 5.199582960977063, "grad_norm": 0.16832612454891205, "learning_rate": 2.761812359425247e-05, "loss": 0.818, "num_input_tokens_seen": 20128528, "step": 34910 }, { "epoch": 5.200327673518022, "grad_norm": 0.2530542314052582, "learning_rate": 2.7611660398630025e-05, "loss": 0.7754, "num_input_tokens_seen": 20131632, "step": 34915 }, { "epoch": 5.201072386058981, "grad_norm": 0.1777404099702835, "learning_rate": 2.7605197026522177e-05, "loss": 0.7883, "num_input_tokens_seen": 20134576, "step": 34920 }, { "epoch": 5.201817098599941, "grad_norm": 0.20810435712337494, "learning_rate": 2.7598733478365713e-05, "loss": 0.8248, "num_input_tokens_seen": 20137584, "step": 34925 }, { "epoch": 5.2025618111408996, "grad_norm": 0.23609589040279388, "learning_rate": 2.75922697545974e-05, "loss": 0.8257, "num_input_tokens_seen": 20140368, "step": 34930 }, { "epoch": 5.203306523681859, "grad_norm": 0.19709689915180206, "learning_rate": 2.7585805855654045e-05, "loss": 0.8019, "num_input_tokens_seen": 20143152, "step": 34935 }, { "epoch": 5.204051236222818, "grad_norm": 0.17217552661895752, "learning_rate": 2.7579341781972436e-05, "loss": 0.7982, "num_input_tokens_seen": 20145616, "step": 34940 }, { "epoch": 5.204795948763778, "grad_norm": 0.3216223418712616, "learning_rate": 2.75728775339894e-05, "loss": 0.7892, "num_input_tokens_seen": 20148432, "step": 34945 }, { "epoch": 5.205540661304736, "grad_norm": 0.20240676403045654, "learning_rate": 2.7566413112141765e-05, "loss": 0.7926, "num_input_tokens_seen": 20151152, "step": 34950 }, { "epoch": 5.206285373845695, "grad_norm": 0.2164032757282257, "learning_rate": 2.7559948516866357e-05, "loss": 0.7926, "num_input_tokens_seen": 20153808, "step": 34955 }, { "epoch": 5.207030086386655, "grad_norm": 0.25050824880599976, "learning_rate": 2.755348374860004e-05, "loss": 0.8127, "num_input_tokens_seen": 20157136, "step": 34960 }, { "epoch": 5.2077747989276135, "grad_norm": 0.2114907056093216, "learning_rate": 2.7547018807779673e-05, "loss": 0.8278, "num_input_tokens_seen": 20160144, "step": 34965 }, { "epoch": 5.208519511468573, "grad_norm": 0.23934811353683472, "learning_rate": 2.7540553694842136e-05, "loss": 0.7701, "num_input_tokens_seen": 20162896, "step": 34970 }, { "epoch": 5.209264224009532, "grad_norm": 0.3032228648662567, "learning_rate": 2.7534088410224302e-05, "loss": 0.8078, "num_input_tokens_seen": 20166320, "step": 34975 }, { "epoch": 5.210008936550492, "grad_norm": 0.21897181868553162, "learning_rate": 2.7527622954363074e-05, "loss": 0.7832, "num_input_tokens_seen": 20169168, "step": 34980 }, { "epoch": 5.21075364909145, "grad_norm": 0.17535872757434845, "learning_rate": 2.752115732769538e-05, "loss": 0.7854, "num_input_tokens_seen": 20171984, "step": 34985 }, { "epoch": 5.21149836163241, "grad_norm": 0.21052159368991852, "learning_rate": 2.7514691530658103e-05, "loss": 0.8036, "num_input_tokens_seen": 20174800, "step": 34990 }, { "epoch": 5.212243074173369, "grad_norm": 0.2803707420825958, "learning_rate": 2.7508225563688206e-05, "loss": 0.7972, "num_input_tokens_seen": 20177808, "step": 34995 }, { "epoch": 5.212987786714328, "grad_norm": 0.25707733631134033, "learning_rate": 2.750175942722262e-05, "loss": 0.7831, "num_input_tokens_seen": 20180848, "step": 35000 }, { "epoch": 5.213732499255287, "grad_norm": 0.3062891364097595, "learning_rate": 2.749529312169831e-05, "loss": 0.8174, "num_input_tokens_seen": 20183792, "step": 35005 }, { "epoch": 5.214477211796247, "grad_norm": 0.18511489033699036, "learning_rate": 2.748882664755223e-05, "loss": 0.8353, "num_input_tokens_seen": 20186448, "step": 35010 }, { "epoch": 5.2152219243372056, "grad_norm": 0.25837966799736023, "learning_rate": 2.748236000522137e-05, "loss": 0.8146, "num_input_tokens_seen": 20189200, "step": 35015 }, { "epoch": 5.215966636878165, "grad_norm": 0.23819129168987274, "learning_rate": 2.7475893195142706e-05, "loss": 0.8076, "num_input_tokens_seen": 20192080, "step": 35020 }, { "epoch": 5.216711349419124, "grad_norm": 0.17329710721969604, "learning_rate": 2.746942621775325e-05, "loss": 0.8251, "num_input_tokens_seen": 20195024, "step": 35025 }, { "epoch": 5.217456061960084, "grad_norm": 0.20762057602405548, "learning_rate": 2.7462959073490007e-05, "loss": 0.7792, "num_input_tokens_seen": 20198160, "step": 35030 }, { "epoch": 5.218200774501042, "grad_norm": 0.25829800963401794, "learning_rate": 2.745649176279001e-05, "loss": 0.7974, "num_input_tokens_seen": 20201296, "step": 35035 }, { "epoch": 5.218945487042002, "grad_norm": 0.2617059648036957, "learning_rate": 2.7450024286090283e-05, "loss": 0.8148, "num_input_tokens_seen": 20204176, "step": 35040 }, { "epoch": 5.219690199582961, "grad_norm": 0.2462710440158844, "learning_rate": 2.7443556643827872e-05, "loss": 0.7877, "num_input_tokens_seen": 20207024, "step": 35045 }, { "epoch": 5.22043491212392, "grad_norm": 0.2632828652858734, "learning_rate": 2.7437088836439844e-05, "loss": 0.7857, "num_input_tokens_seen": 20209968, "step": 35050 }, { "epoch": 5.221179624664879, "grad_norm": 0.24552804231643677, "learning_rate": 2.7430620864363254e-05, "loss": 0.7518, "num_input_tokens_seen": 20212944, "step": 35055 }, { "epoch": 5.221924337205839, "grad_norm": 0.22870203852653503, "learning_rate": 2.7424152728035192e-05, "loss": 0.8127, "num_input_tokens_seen": 20215760, "step": 35060 }, { "epoch": 5.222669049746798, "grad_norm": 0.23901300132274628, "learning_rate": 2.7417684427892747e-05, "loss": 0.7906, "num_input_tokens_seen": 20218512, "step": 35065 }, { "epoch": 5.223413762287757, "grad_norm": 0.28945717215538025, "learning_rate": 2.741121596437302e-05, "loss": 0.8094, "num_input_tokens_seen": 20221360, "step": 35070 }, { "epoch": 5.224158474828716, "grad_norm": 0.2808520495891571, "learning_rate": 2.7404747337913116e-05, "loss": 0.8171, "num_input_tokens_seen": 20224528, "step": 35075 }, { "epoch": 5.224903187369676, "grad_norm": 0.2626647353172302, "learning_rate": 2.739827854895017e-05, "loss": 0.8303, "num_input_tokens_seen": 20227472, "step": 35080 }, { "epoch": 5.225647899910634, "grad_norm": 0.2140585333108902, "learning_rate": 2.73918095979213e-05, "loss": 0.8036, "num_input_tokens_seen": 20230320, "step": 35085 }, { "epoch": 5.226392612451594, "grad_norm": 0.2649135887622833, "learning_rate": 2.7385340485263667e-05, "loss": 0.8127, "num_input_tokens_seen": 20233616, "step": 35090 }, { "epoch": 5.227137324992553, "grad_norm": 0.23882277309894562, "learning_rate": 2.737887121141442e-05, "loss": 0.7776, "num_input_tokens_seen": 20236432, "step": 35095 }, { "epoch": 5.227882037533512, "grad_norm": 0.20514298975467682, "learning_rate": 2.7372401776810736e-05, "loss": 0.8035, "num_input_tokens_seen": 20239440, "step": 35100 }, { "epoch": 5.228626750074471, "grad_norm": 0.25365933775901794, "learning_rate": 2.736593218188978e-05, "loss": 0.7913, "num_input_tokens_seen": 20242512, "step": 35105 }, { "epoch": 5.229371462615431, "grad_norm": 0.28500494360923767, "learning_rate": 2.7359462427088744e-05, "loss": 0.8044, "num_input_tokens_seen": 20245680, "step": 35110 }, { "epoch": 5.23011617515639, "grad_norm": 0.23481491208076477, "learning_rate": 2.7352992512844838e-05, "loss": 0.8171, "num_input_tokens_seen": 20248560, "step": 35115 }, { "epoch": 5.230860887697349, "grad_norm": 0.2610030174255371, "learning_rate": 2.7346522439595256e-05, "loss": 0.8214, "num_input_tokens_seen": 20251504, "step": 35120 }, { "epoch": 5.231605600238308, "grad_norm": 0.1702665537595749, "learning_rate": 2.7340052207777234e-05, "loss": 0.8049, "num_input_tokens_seen": 20254128, "step": 35125 }, { "epoch": 5.232350312779268, "grad_norm": 0.234979048371315, "learning_rate": 2.733358181782799e-05, "loss": 0.7825, "num_input_tokens_seen": 20256976, "step": 35130 }, { "epoch": 5.233095025320226, "grad_norm": 0.2561577260494232, "learning_rate": 2.732711127018478e-05, "loss": 0.8252, "num_input_tokens_seen": 20259760, "step": 35135 }, { "epoch": 5.233839737861185, "grad_norm": 0.22590848803520203, "learning_rate": 2.732064056528485e-05, "loss": 0.7958, "num_input_tokens_seen": 20262864, "step": 35140 }, { "epoch": 5.234584450402145, "grad_norm": 0.24361075460910797, "learning_rate": 2.7314169703565467e-05, "loss": 0.7896, "num_input_tokens_seen": 20265904, "step": 35145 }, { "epoch": 5.235329162943104, "grad_norm": 0.2898080348968506, "learning_rate": 2.7307698685463907e-05, "loss": 0.8309, "num_input_tokens_seen": 20268976, "step": 35150 }, { "epoch": 5.236073875484063, "grad_norm": 0.25891461968421936, "learning_rate": 2.730122751141745e-05, "loss": 0.8002, "num_input_tokens_seen": 20271856, "step": 35155 }, { "epoch": 5.236818588025022, "grad_norm": 0.2605719268321991, "learning_rate": 2.729475618186339e-05, "loss": 0.7687, "num_input_tokens_seen": 20274832, "step": 35160 }, { "epoch": 5.237563300565982, "grad_norm": 0.20376542210578918, "learning_rate": 2.728828469723904e-05, "loss": 0.8273, "num_input_tokens_seen": 20277424, "step": 35165 }, { "epoch": 5.23830801310694, "grad_norm": 0.17778106033802032, "learning_rate": 2.7281813057981715e-05, "loss": 0.7966, "num_input_tokens_seen": 20280528, "step": 35170 }, { "epoch": 5.2390527256479, "grad_norm": 0.15883496403694153, "learning_rate": 2.7275341264528737e-05, "loss": 0.8096, "num_input_tokens_seen": 20283344, "step": 35175 }, { "epoch": 5.239797438188859, "grad_norm": 0.18888415396213531, "learning_rate": 2.7268869317317452e-05, "loss": 0.793, "num_input_tokens_seen": 20286032, "step": 35180 }, { "epoch": 5.240542150729818, "grad_norm": 0.23102924227714539, "learning_rate": 2.72623972167852e-05, "loss": 0.8238, "num_input_tokens_seen": 20289040, "step": 35185 }, { "epoch": 5.241286863270777, "grad_norm": 0.21113519370555878, "learning_rate": 2.7255924963369345e-05, "loss": 0.8001, "num_input_tokens_seen": 20292016, "step": 35190 }, { "epoch": 5.242031575811737, "grad_norm": 0.14152708649635315, "learning_rate": 2.7249452557507243e-05, "loss": 0.8071, "num_input_tokens_seen": 20294864, "step": 35195 }, { "epoch": 5.242776288352696, "grad_norm": 0.22914378345012665, "learning_rate": 2.7242979999636296e-05, "loss": 0.795, "num_input_tokens_seen": 20297808, "step": 35200 }, { "epoch": 5.243521000893655, "grad_norm": 0.1276538372039795, "learning_rate": 2.7236507290193876e-05, "loss": 0.7962, "num_input_tokens_seen": 20300752, "step": 35205 }, { "epoch": 5.244265713434614, "grad_norm": 0.28643372654914856, "learning_rate": 2.7230034429617386e-05, "loss": 0.7758, "num_input_tokens_seen": 20303696, "step": 35210 }, { "epoch": 5.245010425975574, "grad_norm": 0.20381681621074677, "learning_rate": 2.7223561418344234e-05, "loss": 0.7806, "num_input_tokens_seen": 20306320, "step": 35215 }, { "epoch": 5.245755138516532, "grad_norm": 0.2056061029434204, "learning_rate": 2.721708825681184e-05, "loss": 0.7513, "num_input_tokens_seen": 20309360, "step": 35220 }, { "epoch": 5.246499851057492, "grad_norm": 0.21301624178886414, "learning_rate": 2.7210614945457642e-05, "loss": 0.7977, "num_input_tokens_seen": 20312176, "step": 35225 }, { "epoch": 5.247244563598451, "grad_norm": 0.21425656974315643, "learning_rate": 2.7204141484719064e-05, "loss": 0.8096, "num_input_tokens_seen": 20314960, "step": 35230 }, { "epoch": 5.2479892761394105, "grad_norm": 0.18218141794204712, "learning_rate": 2.719766787503357e-05, "loss": 0.8005, "num_input_tokens_seen": 20317808, "step": 35235 }, { "epoch": 5.248733988680369, "grad_norm": 0.23782193660736084, "learning_rate": 2.719119411683862e-05, "loss": 0.7804, "num_input_tokens_seen": 20320624, "step": 35240 }, { "epoch": 5.249478701221329, "grad_norm": 0.26055479049682617, "learning_rate": 2.7184720210571678e-05, "loss": 0.7996, "num_input_tokens_seen": 20323728, "step": 35245 }, { "epoch": 5.250223413762288, "grad_norm": 0.20957961678504944, "learning_rate": 2.717824615667023e-05, "loss": 0.8238, "num_input_tokens_seen": 20326768, "step": 35250 }, { "epoch": 5.250968126303247, "grad_norm": 0.3335433602333069, "learning_rate": 2.7171771955571756e-05, "loss": 0.8607, "num_input_tokens_seen": 20329872, "step": 35255 }, { "epoch": 5.251712838844206, "grad_norm": 0.2736437916755676, "learning_rate": 2.7165297607713763e-05, "loss": 0.8092, "num_input_tokens_seen": 20332720, "step": 35260 }, { "epoch": 5.252457551385166, "grad_norm": 0.16288946568965912, "learning_rate": 2.715882311353377e-05, "loss": 0.777, "num_input_tokens_seen": 20335632, "step": 35265 }, { "epoch": 5.253202263926124, "grad_norm": 0.2706082761287689, "learning_rate": 2.7152348473469285e-05, "loss": 0.8159, "num_input_tokens_seen": 20338576, "step": 35270 }, { "epoch": 5.253946976467084, "grad_norm": 0.20304973423480988, "learning_rate": 2.7145873687957835e-05, "loss": 0.7479, "num_input_tokens_seen": 20342256, "step": 35275 }, { "epoch": 5.254691689008043, "grad_norm": 0.25029662251472473, "learning_rate": 2.7139398757436968e-05, "loss": 0.8102, "num_input_tokens_seen": 20345264, "step": 35280 }, { "epoch": 5.2554364015490025, "grad_norm": 0.20257073640823364, "learning_rate": 2.7132923682344235e-05, "loss": 0.8046, "num_input_tokens_seen": 20348336, "step": 35285 }, { "epoch": 5.256181114089961, "grad_norm": 0.17627573013305664, "learning_rate": 2.7126448463117188e-05, "loss": 0.7909, "num_input_tokens_seen": 20351216, "step": 35290 }, { "epoch": 5.256925826630921, "grad_norm": 0.19883373379707336, "learning_rate": 2.7119973100193397e-05, "loss": 0.7601, "num_input_tokens_seen": 20354032, "step": 35295 }, { "epoch": 5.25767053917188, "grad_norm": 0.33877578377723694, "learning_rate": 2.7113497594010452e-05, "loss": 0.8161, "num_input_tokens_seen": 20356752, "step": 35300 }, { "epoch": 5.258415251712838, "grad_norm": 0.19622883200645447, "learning_rate": 2.710702194500593e-05, "loss": 0.8098, "num_input_tokens_seen": 20359696, "step": 35305 }, { "epoch": 5.259159964253798, "grad_norm": 0.2049250602722168, "learning_rate": 2.7100546153617423e-05, "loss": 0.7934, "num_input_tokens_seen": 20362512, "step": 35310 }, { "epoch": 5.259904676794757, "grad_norm": 0.25114455819129944, "learning_rate": 2.7094070220282553e-05, "loss": 0.811, "num_input_tokens_seen": 20365200, "step": 35315 }, { "epoch": 5.2606493893357165, "grad_norm": 0.19452935457229614, "learning_rate": 2.7087594145438926e-05, "loss": 0.7834, "num_input_tokens_seen": 20368016, "step": 35320 }, { "epoch": 5.261394101876675, "grad_norm": 0.2612780034542084, "learning_rate": 2.7081117929524185e-05, "loss": 0.7864, "num_input_tokens_seen": 20370928, "step": 35325 }, { "epoch": 5.262138814417635, "grad_norm": 0.1915486603975296, "learning_rate": 2.7074641572975944e-05, "loss": 0.7784, "num_input_tokens_seen": 20374160, "step": 35330 }, { "epoch": 5.262883526958594, "grad_norm": 0.22625724971294403, "learning_rate": 2.7068165076231865e-05, "loss": 0.7885, "num_input_tokens_seen": 20377200, "step": 35335 }, { "epoch": 5.263628239499553, "grad_norm": 0.1857072412967682, "learning_rate": 2.7061688439729598e-05, "loss": 0.8103, "num_input_tokens_seen": 20380112, "step": 35340 }, { "epoch": 5.264372952040512, "grad_norm": 0.3204422891139984, "learning_rate": 2.7055211663906814e-05, "loss": 0.8311, "num_input_tokens_seen": 20383056, "step": 35345 }, { "epoch": 5.265117664581472, "grad_norm": 0.15813323855400085, "learning_rate": 2.704873474920118e-05, "loss": 0.7912, "num_input_tokens_seen": 20386032, "step": 35350 }, { "epoch": 5.26586237712243, "grad_norm": 0.20371980965137482, "learning_rate": 2.7042257696050377e-05, "loss": 0.7875, "num_input_tokens_seen": 20389040, "step": 35355 }, { "epoch": 5.26660708966339, "grad_norm": 0.2757081985473633, "learning_rate": 2.70357805048921e-05, "loss": 0.8035, "num_input_tokens_seen": 20392016, "step": 35360 }, { "epoch": 5.267351802204349, "grad_norm": 0.19965197145938873, "learning_rate": 2.7029303176164066e-05, "loss": 0.7734, "num_input_tokens_seen": 20394960, "step": 35365 }, { "epoch": 5.2680965147453085, "grad_norm": 0.2477799504995346, "learning_rate": 2.702282571030396e-05, "loss": 0.7925, "num_input_tokens_seen": 20397712, "step": 35370 }, { "epoch": 5.268841227286267, "grad_norm": 0.18004709482192993, "learning_rate": 2.7016348107749522e-05, "loss": 0.8183, "num_input_tokens_seen": 20400528, "step": 35375 }, { "epoch": 5.269585939827227, "grad_norm": 0.20411886274814606, "learning_rate": 2.7009870368938477e-05, "loss": 0.791, "num_input_tokens_seen": 20403120, "step": 35380 }, { "epoch": 5.270330652368186, "grad_norm": 0.25841695070266724, "learning_rate": 2.700339249430856e-05, "loss": 0.8246, "num_input_tokens_seen": 20406224, "step": 35385 }, { "epoch": 5.271075364909145, "grad_norm": 0.24656610190868378, "learning_rate": 2.6996914484297532e-05, "loss": 0.8027, "num_input_tokens_seen": 20408944, "step": 35390 }, { "epoch": 5.271820077450104, "grad_norm": 0.2676977515220642, "learning_rate": 2.6990436339343134e-05, "loss": 0.812, "num_input_tokens_seen": 20411888, "step": 35395 }, { "epoch": 5.272564789991064, "grad_norm": 0.22451931238174438, "learning_rate": 2.6983958059883145e-05, "loss": 0.8083, "num_input_tokens_seen": 20414672, "step": 35400 }, { "epoch": 5.2733095025320225, "grad_norm": 0.252901166677475, "learning_rate": 2.697747964635533e-05, "loss": 0.8005, "num_input_tokens_seen": 20417616, "step": 35405 }, { "epoch": 5.274054215072982, "grad_norm": 0.22163666784763336, "learning_rate": 2.697100109919749e-05, "loss": 0.8121, "num_input_tokens_seen": 20420368, "step": 35410 }, { "epoch": 5.274798927613941, "grad_norm": 0.2442953735589981, "learning_rate": 2.696452241884741e-05, "loss": 0.7834, "num_input_tokens_seen": 20423440, "step": 35415 }, { "epoch": 5.2755436401549005, "grad_norm": 0.21278993785381317, "learning_rate": 2.6958043605742882e-05, "loss": 0.7821, "num_input_tokens_seen": 20425968, "step": 35420 }, { "epoch": 5.276288352695859, "grad_norm": 0.16073012351989746, "learning_rate": 2.695156466032173e-05, "loss": 0.7811, "num_input_tokens_seen": 20428816, "step": 35425 }, { "epoch": 5.277033065236819, "grad_norm": 0.24975718557834625, "learning_rate": 2.6945085583021783e-05, "loss": 0.8023, "num_input_tokens_seen": 20431536, "step": 35430 }, { "epoch": 5.277777777777778, "grad_norm": 0.29987403750419617, "learning_rate": 2.693860637428085e-05, "loss": 0.8139, "num_input_tokens_seen": 20434704, "step": 35435 }, { "epoch": 5.278522490318737, "grad_norm": 0.24165023863315582, "learning_rate": 2.693212703453678e-05, "loss": 0.8072, "num_input_tokens_seen": 20437488, "step": 35440 }, { "epoch": 5.279267202859696, "grad_norm": 0.19171641767024994, "learning_rate": 2.692564756422743e-05, "loss": 0.7991, "num_input_tokens_seen": 20440240, "step": 35445 }, { "epoch": 5.280011915400656, "grad_norm": 0.18448558449745178, "learning_rate": 2.6919167963790636e-05, "loss": 0.8103, "num_input_tokens_seen": 20443088, "step": 35450 }, { "epoch": 5.2807566279416145, "grad_norm": 0.19400444626808167, "learning_rate": 2.691268823366428e-05, "loss": 0.8125, "num_input_tokens_seen": 20445872, "step": 35455 }, { "epoch": 5.281501340482574, "grad_norm": 0.24451744556427002, "learning_rate": 2.6906208374286223e-05, "loss": 0.8417, "num_input_tokens_seen": 20449008, "step": 35460 }, { "epoch": 5.282246053023533, "grad_norm": 0.21231041848659515, "learning_rate": 2.6899728386094364e-05, "loss": 0.8163, "num_input_tokens_seen": 20452176, "step": 35465 }, { "epoch": 5.282990765564492, "grad_norm": 0.17404644191265106, "learning_rate": 2.6893248269526578e-05, "loss": 0.8177, "num_input_tokens_seen": 20455056, "step": 35470 }, { "epoch": 5.283735478105451, "grad_norm": 0.2872041165828705, "learning_rate": 2.688676802502077e-05, "loss": 0.8084, "num_input_tokens_seen": 20458000, "step": 35475 }, { "epoch": 5.284480190646411, "grad_norm": 0.232387974858284, "learning_rate": 2.688028765301486e-05, "loss": 0.8107, "num_input_tokens_seen": 20460912, "step": 35480 }, { "epoch": 5.28522490318737, "grad_norm": 0.2831839919090271, "learning_rate": 2.687380715394674e-05, "loss": 0.7931, "num_input_tokens_seen": 20463536, "step": 35485 }, { "epoch": 5.2859696157283285, "grad_norm": 0.41464394330978394, "learning_rate": 2.686732652825436e-05, "loss": 0.8038, "num_input_tokens_seen": 20466512, "step": 35490 }, { "epoch": 5.286714328269288, "grad_norm": 0.20530784130096436, "learning_rate": 2.6860845776375643e-05, "loss": 0.7847, "num_input_tokens_seen": 20469488, "step": 35495 }, { "epoch": 5.287459040810247, "grad_norm": 0.20180898904800415, "learning_rate": 2.6854364898748537e-05, "loss": 0.8058, "num_input_tokens_seen": 20472176, "step": 35500 }, { "epoch": 5.2882037533512065, "grad_norm": 0.18494001030921936, "learning_rate": 2.6847883895810984e-05, "loss": 0.8274, "num_input_tokens_seen": 20475184, "step": 35505 }, { "epoch": 5.288948465892165, "grad_norm": 0.1943390965461731, "learning_rate": 2.6841402768000957e-05, "loss": 0.861, "num_input_tokens_seen": 20478320, "step": 35510 }, { "epoch": 5.289693178433125, "grad_norm": 0.2628377079963684, "learning_rate": 2.6834921515756417e-05, "loss": 0.8079, "num_input_tokens_seen": 20480976, "step": 35515 }, { "epoch": 5.290437890974084, "grad_norm": 0.23308883607387543, "learning_rate": 2.6828440139515337e-05, "loss": 0.7881, "num_input_tokens_seen": 20483856, "step": 35520 }, { "epoch": 5.291182603515043, "grad_norm": 0.2198476791381836, "learning_rate": 2.6821958639715704e-05, "loss": 0.7809, "num_input_tokens_seen": 20486992, "step": 35525 }, { "epoch": 5.291927316056002, "grad_norm": 0.20615226030349731, "learning_rate": 2.6815477016795526e-05, "loss": 0.8031, "num_input_tokens_seen": 20489712, "step": 35530 }, { "epoch": 5.292672028596962, "grad_norm": 0.21649175882339478, "learning_rate": 2.6808995271192784e-05, "loss": 0.7902, "num_input_tokens_seen": 20492432, "step": 35535 }, { "epoch": 5.2934167411379205, "grad_norm": 0.1851852685213089, "learning_rate": 2.680251340334549e-05, "loss": 0.7887, "num_input_tokens_seen": 20495504, "step": 35540 }, { "epoch": 5.29416145367888, "grad_norm": 0.20252299308776855, "learning_rate": 2.679603141369168e-05, "loss": 0.7885, "num_input_tokens_seen": 20498512, "step": 35545 }, { "epoch": 5.294906166219839, "grad_norm": 0.21352578699588776, "learning_rate": 2.678954930266937e-05, "loss": 0.8073, "num_input_tokens_seen": 20501456, "step": 35550 }, { "epoch": 5.2956508787607985, "grad_norm": 0.21688644587993622, "learning_rate": 2.6783067070716583e-05, "loss": 0.7751, "num_input_tokens_seen": 20504240, "step": 35555 }, { "epoch": 5.296395591301757, "grad_norm": 0.18451754748821259, "learning_rate": 2.6776584718271376e-05, "loss": 0.8161, "num_input_tokens_seen": 20507024, "step": 35560 }, { "epoch": 5.297140303842717, "grad_norm": 0.18101617693901062, "learning_rate": 2.67701022457718e-05, "loss": 0.8152, "num_input_tokens_seen": 20509808, "step": 35565 }, { "epoch": 5.297885016383676, "grad_norm": 0.2236717939376831, "learning_rate": 2.6763619653655913e-05, "loss": 0.8033, "num_input_tokens_seen": 20512560, "step": 35570 }, { "epoch": 5.298629728924635, "grad_norm": 0.15237681567668915, "learning_rate": 2.6757136942361776e-05, "loss": 0.7886, "num_input_tokens_seen": 20515632, "step": 35575 }, { "epoch": 5.299374441465594, "grad_norm": 0.17733778059482574, "learning_rate": 2.6750654112327474e-05, "loss": 0.7912, "num_input_tokens_seen": 20518608, "step": 35580 }, { "epoch": 5.300119154006554, "grad_norm": 0.2927130162715912, "learning_rate": 2.674417116399108e-05, "loss": 0.774, "num_input_tokens_seen": 20521488, "step": 35585 }, { "epoch": 5.3008638665475125, "grad_norm": 0.18617933988571167, "learning_rate": 2.6737688097790693e-05, "loss": 0.7937, "num_input_tokens_seen": 20524272, "step": 35590 }, { "epoch": 5.301608579088472, "grad_norm": 0.21630439162254333, "learning_rate": 2.6731204914164405e-05, "loss": 0.7763, "num_input_tokens_seen": 20527024, "step": 35595 }, { "epoch": 5.302353291629431, "grad_norm": 0.18876227736473083, "learning_rate": 2.672472161355033e-05, "loss": 0.8295, "num_input_tokens_seen": 20529808, "step": 35600 }, { "epoch": 5.303098004170391, "grad_norm": 0.18617631494998932, "learning_rate": 2.6718238196386576e-05, "loss": 0.8264, "num_input_tokens_seen": 20532848, "step": 35605 }, { "epoch": 5.303842716711349, "grad_norm": 0.23528151214122772, "learning_rate": 2.6711754663111277e-05, "loss": 0.8239, "num_input_tokens_seen": 20535696, "step": 35610 }, { "epoch": 5.304587429252309, "grad_norm": 0.18059174716472626, "learning_rate": 2.6705271014162554e-05, "loss": 0.7998, "num_input_tokens_seen": 20538288, "step": 35615 }, { "epoch": 5.305332141793268, "grad_norm": 0.20051047205924988, "learning_rate": 2.6698787249978546e-05, "loss": 0.8114, "num_input_tokens_seen": 20541456, "step": 35620 }, { "epoch": 5.306076854334227, "grad_norm": 0.19711530208587646, "learning_rate": 2.6692303370997405e-05, "loss": 0.7994, "num_input_tokens_seen": 20544432, "step": 35625 }, { "epoch": 5.306821566875186, "grad_norm": 0.1966572403907776, "learning_rate": 2.668581937765729e-05, "loss": 0.7864, "num_input_tokens_seen": 20547120, "step": 35630 }, { "epoch": 5.307566279416146, "grad_norm": 0.2894478738307953, "learning_rate": 2.667933527039635e-05, "loss": 0.7922, "num_input_tokens_seen": 20550160, "step": 35635 }, { "epoch": 5.3083109919571045, "grad_norm": 0.2809697091579437, "learning_rate": 2.6672851049652752e-05, "loss": 0.7786, "num_input_tokens_seen": 20552848, "step": 35640 }, { "epoch": 5.309055704498064, "grad_norm": 0.20887216925621033, "learning_rate": 2.6666366715864694e-05, "loss": 0.7904, "num_input_tokens_seen": 20555728, "step": 35645 }, { "epoch": 5.309800417039023, "grad_norm": 0.24074754118919373, "learning_rate": 2.665988226947034e-05, "loss": 0.7928, "num_input_tokens_seen": 20558544, "step": 35650 }, { "epoch": 5.310545129579982, "grad_norm": 0.2666606903076172, "learning_rate": 2.6653397710907895e-05, "loss": 0.7943, "num_input_tokens_seen": 20561264, "step": 35655 }, { "epoch": 5.311289842120941, "grad_norm": 0.1281762421131134, "learning_rate": 2.664691304061555e-05, "loss": 0.8029, "num_input_tokens_seen": 20564080, "step": 35660 }, { "epoch": 5.3120345546619, "grad_norm": 0.24452434480190277, "learning_rate": 2.6640428259031525e-05, "loss": 0.783, "num_input_tokens_seen": 20567056, "step": 35665 }, { "epoch": 5.31277926720286, "grad_norm": 0.24129070341587067, "learning_rate": 2.6633943366594027e-05, "loss": 0.7578, "num_input_tokens_seen": 20570064, "step": 35670 }, { "epoch": 5.3135239797438185, "grad_norm": 0.2286379039287567, "learning_rate": 2.6627458363741274e-05, "loss": 0.8016, "num_input_tokens_seen": 20572688, "step": 35675 }, { "epoch": 5.314268692284778, "grad_norm": 0.20220044255256653, "learning_rate": 2.6620973250911506e-05, "loss": 0.835, "num_input_tokens_seen": 20575408, "step": 35680 }, { "epoch": 5.315013404825737, "grad_norm": 0.28377822041511536, "learning_rate": 2.6614488028542948e-05, "loss": 0.7836, "num_input_tokens_seen": 20578224, "step": 35685 }, { "epoch": 5.315758117366697, "grad_norm": 0.2197602093219757, "learning_rate": 2.6608002697073864e-05, "loss": 0.7734, "num_input_tokens_seen": 20581008, "step": 35690 }, { "epoch": 5.316502829907655, "grad_norm": 0.27194803953170776, "learning_rate": 2.6601517256942494e-05, "loss": 0.8114, "num_input_tokens_seen": 20583952, "step": 35695 }, { "epoch": 5.317247542448615, "grad_norm": 0.28208231925964355, "learning_rate": 2.6595031708587093e-05, "loss": 0.815, "num_input_tokens_seen": 20586608, "step": 35700 }, { "epoch": 5.317992254989574, "grad_norm": 0.24638086557388306, "learning_rate": 2.6588546052445933e-05, "loss": 0.8159, "num_input_tokens_seen": 20589232, "step": 35705 }, { "epoch": 5.318736967530533, "grad_norm": 0.2718050181865692, "learning_rate": 2.6582060288957295e-05, "loss": 0.7877, "num_input_tokens_seen": 20591920, "step": 35710 }, { "epoch": 5.319481680071492, "grad_norm": 0.22170130908489227, "learning_rate": 2.6575574418559456e-05, "loss": 0.8504, "num_input_tokens_seen": 20595056, "step": 35715 }, { "epoch": 5.320226392612452, "grad_norm": 0.23238152265548706, "learning_rate": 2.6569088441690697e-05, "loss": 0.799, "num_input_tokens_seen": 20597680, "step": 35720 }, { "epoch": 5.3209711051534105, "grad_norm": 0.22317680716514587, "learning_rate": 2.6562602358789324e-05, "loss": 0.7726, "num_input_tokens_seen": 20600880, "step": 35725 }, { "epoch": 5.32171581769437, "grad_norm": 0.1781301647424698, "learning_rate": 2.6556116170293645e-05, "loss": 0.8249, "num_input_tokens_seen": 20603440, "step": 35730 }, { "epoch": 5.322460530235329, "grad_norm": 0.19475626945495605, "learning_rate": 2.6549629876641953e-05, "loss": 0.812, "num_input_tokens_seen": 20606544, "step": 35735 }, { "epoch": 5.323205242776289, "grad_norm": 0.19203200936317444, "learning_rate": 2.654314347827257e-05, "loss": 0.7593, "num_input_tokens_seen": 20609616, "step": 35740 }, { "epoch": 5.323949955317247, "grad_norm": 0.23244714736938477, "learning_rate": 2.653665697562383e-05, "loss": 0.7932, "num_input_tokens_seen": 20612688, "step": 35745 }, { "epoch": 5.324694667858207, "grad_norm": 0.22659042477607727, "learning_rate": 2.6530170369134062e-05, "loss": 0.7745, "num_input_tokens_seen": 20615824, "step": 35750 }, { "epoch": 5.325439380399166, "grad_norm": 0.26887306571006775, "learning_rate": 2.6523683659241594e-05, "loss": 0.7852, "num_input_tokens_seen": 20618704, "step": 35755 }, { "epoch": 5.326184092940125, "grad_norm": 0.21807411313056946, "learning_rate": 2.651719684638479e-05, "loss": 0.787, "num_input_tokens_seen": 20621680, "step": 35760 }, { "epoch": 5.326928805481084, "grad_norm": 0.1914103925228119, "learning_rate": 2.651070993100198e-05, "loss": 0.8161, "num_input_tokens_seen": 20624880, "step": 35765 }, { "epoch": 5.327673518022044, "grad_norm": 0.2227652668952942, "learning_rate": 2.6504222913531545e-05, "loss": 0.8196, "num_input_tokens_seen": 20627856, "step": 35770 }, { "epoch": 5.328418230563003, "grad_norm": 0.22274746000766754, "learning_rate": 2.6497735794411833e-05, "loss": 0.8137, "num_input_tokens_seen": 20630832, "step": 35775 }, { "epoch": 5.329162943103962, "grad_norm": 0.22393450140953064, "learning_rate": 2.6491248574081228e-05, "loss": 0.8033, "num_input_tokens_seen": 20633872, "step": 35780 }, { "epoch": 5.329907655644921, "grad_norm": 0.21497885882854462, "learning_rate": 2.6484761252978107e-05, "loss": 0.805, "num_input_tokens_seen": 20636624, "step": 35785 }, { "epoch": 5.330652368185881, "grad_norm": 0.2997853755950928, "learning_rate": 2.6478273831540863e-05, "loss": 0.8101, "num_input_tokens_seen": 20639536, "step": 35790 }, { "epoch": 5.331397080726839, "grad_norm": 0.18108700215816498, "learning_rate": 2.647178631020788e-05, "loss": 0.7937, "num_input_tokens_seen": 20642352, "step": 35795 }, { "epoch": 5.332141793267799, "grad_norm": 0.19611217081546783, "learning_rate": 2.6465298689417555e-05, "loss": 0.7907, "num_input_tokens_seen": 20645072, "step": 35800 }, { "epoch": 5.332886505808758, "grad_norm": 0.20012809336185455, "learning_rate": 2.6458810969608304e-05, "loss": 0.8014, "num_input_tokens_seen": 20647888, "step": 35805 }, { "epoch": 5.333631218349717, "grad_norm": 0.3155653774738312, "learning_rate": 2.645232315121855e-05, "loss": 0.7874, "num_input_tokens_seen": 20650448, "step": 35810 }, { "epoch": 5.334375930890676, "grad_norm": 0.25925126671791077, "learning_rate": 2.6445835234686693e-05, "loss": 0.774, "num_input_tokens_seen": 20653296, "step": 35815 }, { "epoch": 5.335120643431635, "grad_norm": 0.28925010561943054, "learning_rate": 2.643934722045117e-05, "loss": 0.7996, "num_input_tokens_seen": 20656208, "step": 35820 }, { "epoch": 5.335865355972595, "grad_norm": 0.23191127181053162, "learning_rate": 2.6432859108950413e-05, "loss": 0.8061, "num_input_tokens_seen": 20659376, "step": 35825 }, { "epoch": 5.336610068513554, "grad_norm": 0.2680060863494873, "learning_rate": 2.642637090062287e-05, "loss": 0.8179, "num_input_tokens_seen": 20662384, "step": 35830 }, { "epoch": 5.337354781054513, "grad_norm": 0.2570877969264984, "learning_rate": 2.6419882595906976e-05, "loss": 0.8033, "num_input_tokens_seen": 20665168, "step": 35835 }, { "epoch": 5.338099493595472, "grad_norm": 0.2286909520626068, "learning_rate": 2.6413394195241186e-05, "loss": 0.7847, "num_input_tokens_seen": 20667888, "step": 35840 }, { "epoch": 5.338844206136431, "grad_norm": 0.20860999822616577, "learning_rate": 2.6406905699063965e-05, "loss": 0.8039, "num_input_tokens_seen": 20670928, "step": 35845 }, { "epoch": 5.33958891867739, "grad_norm": 0.14463582634925842, "learning_rate": 2.640041710781378e-05, "loss": 0.8025, "num_input_tokens_seen": 20673840, "step": 35850 }, { "epoch": 5.34033363121835, "grad_norm": 0.20556889474391937, "learning_rate": 2.6393928421929098e-05, "loss": 0.838, "num_input_tokens_seen": 20676816, "step": 35855 }, { "epoch": 5.341078343759309, "grad_norm": 0.2105233520269394, "learning_rate": 2.6387439641848405e-05, "loss": 0.8124, "num_input_tokens_seen": 20679792, "step": 35860 }, { "epoch": 5.341823056300268, "grad_norm": 0.15089793503284454, "learning_rate": 2.638095076801017e-05, "loss": 0.8162, "num_input_tokens_seen": 20682800, "step": 35865 }, { "epoch": 5.342567768841227, "grad_norm": 0.16144225001335144, "learning_rate": 2.6374461800852907e-05, "loss": 0.82, "num_input_tokens_seen": 20685520, "step": 35870 }, { "epoch": 5.343312481382187, "grad_norm": 0.18864040076732635, "learning_rate": 2.63679727408151e-05, "loss": 0.7994, "num_input_tokens_seen": 20688432, "step": 35875 }, { "epoch": 5.344057193923145, "grad_norm": 0.19309067726135254, "learning_rate": 2.6361483588335257e-05, "loss": 0.8251, "num_input_tokens_seen": 20691408, "step": 35880 }, { "epoch": 5.344801906464105, "grad_norm": 0.18698759377002716, "learning_rate": 2.6354994343851884e-05, "loss": 0.8101, "num_input_tokens_seen": 20694448, "step": 35885 }, { "epoch": 5.345546619005064, "grad_norm": 0.20605415105819702, "learning_rate": 2.6348505007803515e-05, "loss": 0.8257, "num_input_tokens_seen": 20697104, "step": 35890 }, { "epoch": 5.346291331546023, "grad_norm": 0.21652638912200928, "learning_rate": 2.6342015580628655e-05, "loss": 0.7908, "num_input_tokens_seen": 20699856, "step": 35895 }, { "epoch": 5.347036044086982, "grad_norm": 0.15898504853248596, "learning_rate": 2.633552606276583e-05, "loss": 0.8151, "num_input_tokens_seen": 20702608, "step": 35900 }, { "epoch": 5.347780756627942, "grad_norm": 0.17340654134750366, "learning_rate": 2.6329036454653588e-05, "loss": 0.8135, "num_input_tokens_seen": 20705424, "step": 35905 }, { "epoch": 5.348525469168901, "grad_norm": 0.18749283254146576, "learning_rate": 2.632254675673047e-05, "loss": 0.8092, "num_input_tokens_seen": 20708112, "step": 35910 }, { "epoch": 5.34927018170986, "grad_norm": 0.20726501941680908, "learning_rate": 2.6316056969435022e-05, "loss": 0.7747, "num_input_tokens_seen": 20711312, "step": 35915 }, { "epoch": 5.350014894250819, "grad_norm": 0.1812106817960739, "learning_rate": 2.6309567093205784e-05, "loss": 0.7883, "num_input_tokens_seen": 20714288, "step": 35920 }, { "epoch": 5.350759606791779, "grad_norm": 0.17478716373443604, "learning_rate": 2.6303077128481335e-05, "loss": 0.8293, "num_input_tokens_seen": 20717296, "step": 35925 }, { "epoch": 5.351504319332737, "grad_norm": 0.21222993731498718, "learning_rate": 2.629658707570023e-05, "loss": 0.7935, "num_input_tokens_seen": 20720240, "step": 35930 }, { "epoch": 5.352249031873697, "grad_norm": 0.2597534954547882, "learning_rate": 2.6290096935301034e-05, "loss": 0.7858, "num_input_tokens_seen": 20723120, "step": 35935 }, { "epoch": 5.352993744414656, "grad_norm": 0.20417055487632751, "learning_rate": 2.6283606707722336e-05, "loss": 0.7993, "num_input_tokens_seen": 20726096, "step": 35940 }, { "epoch": 5.3537384569556155, "grad_norm": 0.25671881437301636, "learning_rate": 2.6277116393402718e-05, "loss": 0.8014, "num_input_tokens_seen": 20728976, "step": 35945 }, { "epoch": 5.354483169496574, "grad_norm": 0.15046758949756622, "learning_rate": 2.6270625992780772e-05, "loss": 0.8201, "num_input_tokens_seen": 20731728, "step": 35950 }, { "epoch": 5.355227882037534, "grad_norm": 0.19428348541259766, "learning_rate": 2.626413550629508e-05, "loss": 0.8083, "num_input_tokens_seen": 20734736, "step": 35955 }, { "epoch": 5.355972594578493, "grad_norm": 0.20327195525169373, "learning_rate": 2.625764493438425e-05, "loss": 0.7998, "num_input_tokens_seen": 20737488, "step": 35960 }, { "epoch": 5.356717307119452, "grad_norm": 0.26297664642333984, "learning_rate": 2.625115427748689e-05, "loss": 0.7992, "num_input_tokens_seen": 20740688, "step": 35965 }, { "epoch": 5.357462019660411, "grad_norm": 0.24903930723667145, "learning_rate": 2.6244663536041614e-05, "loss": 0.7872, "num_input_tokens_seen": 20743440, "step": 35970 }, { "epoch": 5.358206732201371, "grad_norm": 0.22890977561473846, "learning_rate": 2.623817271048703e-05, "loss": 0.8379, "num_input_tokens_seen": 20746384, "step": 35975 }, { "epoch": 5.358951444742329, "grad_norm": 0.2108873426914215, "learning_rate": 2.623168180126177e-05, "loss": 0.82, "num_input_tokens_seen": 20749072, "step": 35980 }, { "epoch": 5.359696157283288, "grad_norm": 0.1779668629169464, "learning_rate": 2.6225190808804463e-05, "loss": 0.806, "num_input_tokens_seen": 20751952, "step": 35985 }, { "epoch": 5.360440869824248, "grad_norm": 0.28434664011001587, "learning_rate": 2.6218699733553742e-05, "loss": 0.791, "num_input_tokens_seen": 20755024, "step": 35990 }, { "epoch": 5.3611855823652075, "grad_norm": 0.20204965770244598, "learning_rate": 2.6212208575948257e-05, "loss": 0.7901, "num_input_tokens_seen": 20757840, "step": 35995 }, { "epoch": 5.361930294906166, "grad_norm": 0.2164333462715149, "learning_rate": 2.6205717336426632e-05, "loss": 0.8101, "num_input_tokens_seen": 20760944, "step": 36000 }, { "epoch": 5.362675007447125, "grad_norm": 0.17811785638332367, "learning_rate": 2.6199226015427532e-05, "loss": 0.8038, "num_input_tokens_seen": 20763728, "step": 36005 }, { "epoch": 5.363419719988085, "grad_norm": 0.24304531514644623, "learning_rate": 2.619273461338962e-05, "loss": 0.8114, "num_input_tokens_seen": 20766736, "step": 36010 }, { "epoch": 5.364164432529043, "grad_norm": 0.3150557279586792, "learning_rate": 2.6186243130751554e-05, "loss": 0.7745, "num_input_tokens_seen": 20769680, "step": 36015 }, { "epoch": 5.364909145070003, "grad_norm": 0.19464392960071564, "learning_rate": 2.6179751567951992e-05, "loss": 0.7997, "num_input_tokens_seen": 20772592, "step": 36020 }, { "epoch": 5.365653857610962, "grad_norm": 0.18508461117744446, "learning_rate": 2.617325992542962e-05, "loss": 0.8064, "num_input_tokens_seen": 20775376, "step": 36025 }, { "epoch": 5.3663985701519215, "grad_norm": 0.18209950625896454, "learning_rate": 2.616676820362311e-05, "loss": 0.7875, "num_input_tokens_seen": 20778544, "step": 36030 }, { "epoch": 5.36714328269288, "grad_norm": 0.2069060355424881, "learning_rate": 2.6160276402971153e-05, "loss": 0.7954, "num_input_tokens_seen": 20781584, "step": 36035 }, { "epoch": 5.36788799523384, "grad_norm": 0.22833067178726196, "learning_rate": 2.615378452391243e-05, "loss": 0.7917, "num_input_tokens_seen": 20784400, "step": 36040 }, { "epoch": 5.368632707774799, "grad_norm": 0.2451973855495453, "learning_rate": 2.614729256688564e-05, "loss": 0.8425, "num_input_tokens_seen": 20787248, "step": 36045 }, { "epoch": 5.369377420315758, "grad_norm": 0.2059842199087143, "learning_rate": 2.6140800532329486e-05, "loss": 0.8325, "num_input_tokens_seen": 20789936, "step": 36050 }, { "epoch": 5.370122132856717, "grad_norm": 0.1910001039505005, "learning_rate": 2.6134308420682667e-05, "loss": 0.8217, "num_input_tokens_seen": 20793200, "step": 36055 }, { "epoch": 5.370866845397677, "grad_norm": 0.21246075630187988, "learning_rate": 2.61278162323839e-05, "loss": 0.8135, "num_input_tokens_seen": 20796240, "step": 36060 }, { "epoch": 5.371611557938635, "grad_norm": 0.21007227897644043, "learning_rate": 2.612132396787189e-05, "loss": 0.7888, "num_input_tokens_seen": 20798960, "step": 36065 }, { "epoch": 5.372356270479595, "grad_norm": 0.17013101279735565, "learning_rate": 2.6114831627585367e-05, "loss": 0.8226, "num_input_tokens_seen": 20801872, "step": 36070 }, { "epoch": 5.373100983020554, "grad_norm": 0.19031667709350586, "learning_rate": 2.610833921196306e-05, "loss": 0.7917, "num_input_tokens_seen": 20804784, "step": 36075 }, { "epoch": 5.3738456955615135, "grad_norm": 0.18313147127628326, "learning_rate": 2.61018467214437e-05, "loss": 0.7905, "num_input_tokens_seen": 20807856, "step": 36080 }, { "epoch": 5.374590408102472, "grad_norm": 0.1737482249736786, "learning_rate": 2.609535415646601e-05, "loss": 0.7812, "num_input_tokens_seen": 20810544, "step": 36085 }, { "epoch": 5.375335120643432, "grad_norm": 0.30276912450790405, "learning_rate": 2.6088861517468745e-05, "loss": 0.8092, "num_input_tokens_seen": 20813360, "step": 36090 }, { "epoch": 5.376079833184391, "grad_norm": 0.21740196645259857, "learning_rate": 2.6082368804890644e-05, "loss": 0.7822, "num_input_tokens_seen": 20816272, "step": 36095 }, { "epoch": 5.37682454572535, "grad_norm": 0.15489067137241364, "learning_rate": 2.6075876019170453e-05, "loss": 0.7907, "num_input_tokens_seen": 20819088, "step": 36100 }, { "epoch": 5.377569258266309, "grad_norm": 0.21382151544094086, "learning_rate": 2.606938316074694e-05, "loss": 0.8171, "num_input_tokens_seen": 20822000, "step": 36105 }, { "epoch": 5.378313970807269, "grad_norm": 0.25734376907348633, "learning_rate": 2.606289023005886e-05, "loss": 0.7677, "num_input_tokens_seen": 20824784, "step": 36110 }, { "epoch": 5.3790586833482275, "grad_norm": 0.1513800472021103, "learning_rate": 2.6056397227544988e-05, "loss": 0.7999, "num_input_tokens_seen": 20827536, "step": 36115 }, { "epoch": 5.379803395889187, "grad_norm": 0.25526827573776245, "learning_rate": 2.6049904153644072e-05, "loss": 0.785, "num_input_tokens_seen": 20830512, "step": 36120 }, { "epoch": 5.380548108430146, "grad_norm": 0.21430125832557678, "learning_rate": 2.6043411008794915e-05, "loss": 0.8183, "num_input_tokens_seen": 20833168, "step": 36125 }, { "epoch": 5.3812928209711055, "grad_norm": 0.19393005967140198, "learning_rate": 2.603691779343627e-05, "loss": 0.784, "num_input_tokens_seen": 20835920, "step": 36130 }, { "epoch": 5.382037533512064, "grad_norm": 0.16104918718338013, "learning_rate": 2.603042450800695e-05, "loss": 0.7974, "num_input_tokens_seen": 20838896, "step": 36135 }, { "epoch": 5.382782246053024, "grad_norm": 0.31572362780570984, "learning_rate": 2.6023931152945725e-05, "loss": 0.8105, "num_input_tokens_seen": 20841712, "step": 36140 }, { "epoch": 5.383526958593983, "grad_norm": 0.16703571379184723, "learning_rate": 2.6017437728691396e-05, "loss": 0.7853, "num_input_tokens_seen": 20844848, "step": 36145 }, { "epoch": 5.384271671134942, "grad_norm": 0.32262229919433594, "learning_rate": 2.601094423568276e-05, "loss": 0.7956, "num_input_tokens_seen": 20847888, "step": 36150 }, { "epoch": 5.385016383675901, "grad_norm": 0.19223618507385254, "learning_rate": 2.6004450674358628e-05, "loss": 0.8012, "num_input_tokens_seen": 20850672, "step": 36155 }, { "epoch": 5.385761096216861, "grad_norm": 0.187381774187088, "learning_rate": 2.59979570451578e-05, "loss": 0.7826, "num_input_tokens_seen": 20853392, "step": 36160 }, { "epoch": 5.3865058087578195, "grad_norm": 0.23167334496974945, "learning_rate": 2.599146334851909e-05, "loss": 0.7819, "num_input_tokens_seen": 20856176, "step": 36165 }, { "epoch": 5.387250521298778, "grad_norm": 0.25909167528152466, "learning_rate": 2.598496958488132e-05, "loss": 0.7707, "num_input_tokens_seen": 20859152, "step": 36170 }, { "epoch": 5.387995233839738, "grad_norm": 0.2680748701095581, "learning_rate": 2.5978475754683307e-05, "loss": 0.8184, "num_input_tokens_seen": 20862096, "step": 36175 }, { "epoch": 5.388739946380697, "grad_norm": 0.3282254636287689, "learning_rate": 2.5971981858363886e-05, "loss": 0.8143, "num_input_tokens_seen": 20865200, "step": 36180 }, { "epoch": 5.389484658921656, "grad_norm": 0.12140095978975296, "learning_rate": 2.5965487896361878e-05, "loss": 0.7732, "num_input_tokens_seen": 20867920, "step": 36185 }, { "epoch": 5.390229371462615, "grad_norm": 0.19774213433265686, "learning_rate": 2.5958993869116134e-05, "loss": 0.782, "num_input_tokens_seen": 20870576, "step": 36190 }, { "epoch": 5.390974084003575, "grad_norm": 0.25462400913238525, "learning_rate": 2.5952499777065474e-05, "loss": 0.7893, "num_input_tokens_seen": 20873328, "step": 36195 }, { "epoch": 5.3917187965445335, "grad_norm": 0.2155199944972992, "learning_rate": 2.594600562064875e-05, "loss": 0.82, "num_input_tokens_seen": 20876496, "step": 36200 }, { "epoch": 5.392463509085493, "grad_norm": 0.18529754877090454, "learning_rate": 2.593951140030481e-05, "loss": 0.8281, "num_input_tokens_seen": 20879600, "step": 36205 }, { "epoch": 5.393208221626452, "grad_norm": 0.14487126469612122, "learning_rate": 2.593301711647252e-05, "loss": 0.7828, "num_input_tokens_seen": 20882448, "step": 36210 }, { "epoch": 5.3939529341674115, "grad_norm": 0.2262430489063263, "learning_rate": 2.592652276959072e-05, "loss": 0.8162, "num_input_tokens_seen": 20885424, "step": 36215 }, { "epoch": 5.39469764670837, "grad_norm": 0.22273913025856018, "learning_rate": 2.592002836009828e-05, "loss": 0.8043, "num_input_tokens_seen": 20888080, "step": 36220 }, { "epoch": 5.39544235924933, "grad_norm": 0.15646319091320038, "learning_rate": 2.5913533888434067e-05, "loss": 0.8285, "num_input_tokens_seen": 20891024, "step": 36225 }, { "epoch": 5.396187071790289, "grad_norm": 0.19234168529510498, "learning_rate": 2.5907039355036944e-05, "loss": 0.7745, "num_input_tokens_seen": 20894064, "step": 36230 }, { "epoch": 5.396931784331248, "grad_norm": 0.1834539771080017, "learning_rate": 2.590054476034579e-05, "loss": 0.7818, "num_input_tokens_seen": 20896976, "step": 36235 }, { "epoch": 5.397676496872207, "grad_norm": 0.27513545751571655, "learning_rate": 2.5894050104799477e-05, "loss": 0.8075, "num_input_tokens_seen": 20899984, "step": 36240 }, { "epoch": 5.398421209413167, "grad_norm": 0.24472177028656006, "learning_rate": 2.5887555388836905e-05, "loss": 0.8105, "num_input_tokens_seen": 20902992, "step": 36245 }, { "epoch": 5.3991659219541255, "grad_norm": 0.23070411384105682, "learning_rate": 2.5881060612896936e-05, "loss": 0.8287, "num_input_tokens_seen": 20905552, "step": 36250 }, { "epoch": 5.399910634495085, "grad_norm": 0.17910248041152954, "learning_rate": 2.587456577741848e-05, "loss": 0.806, "num_input_tokens_seen": 20908528, "step": 36255 }, { "epoch": 5.400655347036044, "grad_norm": 0.20353510975837708, "learning_rate": 2.5868070882840423e-05, "loss": 0.7874, "num_input_tokens_seen": 20911376, "step": 36260 }, { "epoch": 5.4014000595770035, "grad_norm": 0.17896893620491028, "learning_rate": 2.5861575929601663e-05, "loss": 0.7604, "num_input_tokens_seen": 20914256, "step": 36265 }, { "epoch": 5.402144772117962, "grad_norm": 0.2226618528366089, "learning_rate": 2.5855080918141107e-05, "loss": 0.8319, "num_input_tokens_seen": 20916944, "step": 36270 }, { "epoch": 5.402889484658922, "grad_norm": 0.2651623785495758, "learning_rate": 2.5848585848897654e-05, "loss": 0.7921, "num_input_tokens_seen": 20919536, "step": 36275 }, { "epoch": 5.403634197199881, "grad_norm": 0.144557923078537, "learning_rate": 2.584209072231023e-05, "loss": 0.8022, "num_input_tokens_seen": 20922288, "step": 36280 }, { "epoch": 5.40437890974084, "grad_norm": 0.27845123410224915, "learning_rate": 2.583559553881773e-05, "loss": 0.799, "num_input_tokens_seen": 20925104, "step": 36285 }, { "epoch": 5.405123622281799, "grad_norm": 0.19913768768310547, "learning_rate": 2.582910029885909e-05, "loss": 0.7906, "num_input_tokens_seen": 20927856, "step": 36290 }, { "epoch": 5.405868334822759, "grad_norm": 0.25305500626564026, "learning_rate": 2.5822605002873213e-05, "loss": 0.7939, "num_input_tokens_seen": 20930864, "step": 36295 }, { "epoch": 5.4066130473637175, "grad_norm": 0.21059706807136536, "learning_rate": 2.5816109651299035e-05, "loss": 0.8313, "num_input_tokens_seen": 20934288, "step": 36300 }, { "epoch": 5.407357759904677, "grad_norm": 0.23787866532802582, "learning_rate": 2.5809614244575488e-05, "loss": 0.7787, "num_input_tokens_seen": 20937040, "step": 36305 }, { "epoch": 5.408102472445636, "grad_norm": 0.15342286229133606, "learning_rate": 2.58031187831415e-05, "loss": 0.8316, "num_input_tokens_seen": 20939920, "step": 36310 }, { "epoch": 5.408847184986596, "grad_norm": 0.31426531076431274, "learning_rate": 2.5796623267436016e-05, "loss": 0.8039, "num_input_tokens_seen": 20942992, "step": 36315 }, { "epoch": 5.409591897527554, "grad_norm": 0.2014666199684143, "learning_rate": 2.579012769789796e-05, "loss": 0.803, "num_input_tokens_seen": 20945712, "step": 36320 }, { "epoch": 5.410336610068514, "grad_norm": 0.1624308079481125, "learning_rate": 2.5783632074966298e-05, "loss": 0.7847, "num_input_tokens_seen": 20948208, "step": 36325 }, { "epoch": 5.411081322609473, "grad_norm": 0.20683704316616058, "learning_rate": 2.5777136399079955e-05, "loss": 0.8166, "num_input_tokens_seen": 20951280, "step": 36330 }, { "epoch": 5.4118260351504315, "grad_norm": 0.2220277488231659, "learning_rate": 2.5770640670677902e-05, "loss": 0.8044, "num_input_tokens_seen": 20954000, "step": 36335 }, { "epoch": 5.412570747691391, "grad_norm": 0.13838960230350494, "learning_rate": 2.5764144890199078e-05, "loss": 0.7762, "num_input_tokens_seen": 20956976, "step": 36340 }, { "epoch": 5.413315460232351, "grad_norm": 0.1692541241645813, "learning_rate": 2.5757649058082455e-05, "loss": 0.8207, "num_input_tokens_seen": 20960016, "step": 36345 }, { "epoch": 5.4140601727733095, "grad_norm": 0.28157880902290344, "learning_rate": 2.5751153174766983e-05, "loss": 0.8021, "num_input_tokens_seen": 20962672, "step": 36350 }, { "epoch": 5.414804885314268, "grad_norm": 0.1717136800289154, "learning_rate": 2.5744657240691646e-05, "loss": 0.8591, "num_input_tokens_seen": 20965584, "step": 36355 }, { "epoch": 5.415549597855228, "grad_norm": 0.22105038166046143, "learning_rate": 2.5738161256295396e-05, "loss": 0.8254, "num_input_tokens_seen": 20968656, "step": 36360 }, { "epoch": 5.416294310396187, "grad_norm": 0.24014878273010254, "learning_rate": 2.5731665222017202e-05, "loss": 0.8081, "num_input_tokens_seen": 20971760, "step": 36365 }, { "epoch": 5.417039022937146, "grad_norm": 0.2493799775838852, "learning_rate": 2.5725169138296046e-05, "loss": 0.7953, "num_input_tokens_seen": 20974864, "step": 36370 }, { "epoch": 5.417783735478105, "grad_norm": 0.1568395048379898, "learning_rate": 2.571867300557092e-05, "loss": 0.7874, "num_input_tokens_seen": 20977456, "step": 36375 }, { "epoch": 5.418528448019065, "grad_norm": 0.16928677260875702, "learning_rate": 2.5712176824280787e-05, "loss": 0.7747, "num_input_tokens_seen": 20980208, "step": 36380 }, { "epoch": 5.4192731605600235, "grad_norm": 0.1907312273979187, "learning_rate": 2.5705680594864634e-05, "loss": 0.8272, "num_input_tokens_seen": 20983152, "step": 36385 }, { "epoch": 5.420017873100983, "grad_norm": 0.24999091029167175, "learning_rate": 2.5699184317761465e-05, "loss": 0.8131, "num_input_tokens_seen": 20986128, "step": 36390 }, { "epoch": 5.420762585641942, "grad_norm": 0.21152690052986145, "learning_rate": 2.5692687993410263e-05, "loss": 0.7812, "num_input_tokens_seen": 20989040, "step": 36395 }, { "epoch": 5.421507298182902, "grad_norm": 0.2071940153837204, "learning_rate": 2.5686191622250017e-05, "loss": 0.7887, "num_input_tokens_seen": 20991792, "step": 36400 }, { "epoch": 5.42225201072386, "grad_norm": 0.1604226976633072, "learning_rate": 2.567969520471973e-05, "loss": 0.794, "num_input_tokens_seen": 20994576, "step": 36405 }, { "epoch": 5.42299672326482, "grad_norm": 0.1509154587984085, "learning_rate": 2.5673198741258408e-05, "loss": 0.8028, "num_input_tokens_seen": 20996976, "step": 36410 }, { "epoch": 5.423741435805779, "grad_norm": 0.1944771111011505, "learning_rate": 2.5666702232305055e-05, "loss": 0.8016, "num_input_tokens_seen": 20999760, "step": 36415 }, { "epoch": 5.424486148346738, "grad_norm": 0.227998748421669, "learning_rate": 2.5660205678298664e-05, "loss": 0.7824, "num_input_tokens_seen": 21002704, "step": 36420 }, { "epoch": 5.425230860887697, "grad_norm": 0.22687825560569763, "learning_rate": 2.5653709079678274e-05, "loss": 0.822, "num_input_tokens_seen": 21005456, "step": 36425 }, { "epoch": 5.425975573428657, "grad_norm": 0.19230499863624573, "learning_rate": 2.5647212436882867e-05, "loss": 0.7856, "num_input_tokens_seen": 21008144, "step": 36430 }, { "epoch": 5.4267202859696155, "grad_norm": 0.24531066417694092, "learning_rate": 2.5640715750351486e-05, "loss": 0.7905, "num_input_tokens_seen": 21010864, "step": 36435 }, { "epoch": 5.427464998510575, "grad_norm": 0.2662947475910187, "learning_rate": 2.5634219020523132e-05, "loss": 0.8038, "num_input_tokens_seen": 21013936, "step": 36440 }, { "epoch": 5.428209711051534, "grad_norm": 0.15580332279205322, "learning_rate": 2.5627722247836838e-05, "loss": 0.811, "num_input_tokens_seen": 21016496, "step": 36445 }, { "epoch": 5.428954423592494, "grad_norm": 0.17551547288894653, "learning_rate": 2.5621225432731626e-05, "loss": 0.817, "num_input_tokens_seen": 21019376, "step": 36450 }, { "epoch": 5.429699136133452, "grad_norm": 0.18206867575645447, "learning_rate": 2.561472857564653e-05, "loss": 0.7987, "num_input_tokens_seen": 21022224, "step": 36455 }, { "epoch": 5.430443848674412, "grad_norm": 0.20563849806785583, "learning_rate": 2.560823167702057e-05, "loss": 0.8149, "num_input_tokens_seen": 21025072, "step": 36460 }, { "epoch": 5.431188561215371, "grad_norm": 0.17916016280651093, "learning_rate": 2.5601734737292787e-05, "loss": 0.8053, "num_input_tokens_seen": 21027920, "step": 36465 }, { "epoch": 5.43193327375633, "grad_norm": 0.24745610356330872, "learning_rate": 2.5595237756902217e-05, "loss": 0.7915, "num_input_tokens_seen": 21030832, "step": 36470 }, { "epoch": 5.432677986297289, "grad_norm": 0.18335390090942383, "learning_rate": 2.558874073628791e-05, "loss": 0.8067, "num_input_tokens_seen": 21033552, "step": 36475 }, { "epoch": 5.433422698838249, "grad_norm": 0.23004864156246185, "learning_rate": 2.5582243675888885e-05, "loss": 0.7759, "num_input_tokens_seen": 21036240, "step": 36480 }, { "epoch": 5.434167411379208, "grad_norm": 0.24582897126674652, "learning_rate": 2.55757465761442e-05, "loss": 0.7974, "num_input_tokens_seen": 21039248, "step": 36485 }, { "epoch": 5.434912123920167, "grad_norm": 0.277640700340271, "learning_rate": 2.5569249437492903e-05, "loss": 0.8268, "num_input_tokens_seen": 21042864, "step": 36490 }, { "epoch": 5.435656836461126, "grad_norm": 0.2340877503156662, "learning_rate": 2.5562752260374053e-05, "loss": 0.817, "num_input_tokens_seen": 21045744, "step": 36495 }, { "epoch": 5.436401549002086, "grad_norm": 0.3250400125980377, "learning_rate": 2.555625504522668e-05, "loss": 0.8154, "num_input_tokens_seen": 21048560, "step": 36500 }, { "epoch": 5.437146261543044, "grad_norm": 0.18422648310661316, "learning_rate": 2.5549757792489853e-05, "loss": 0.8147, "num_input_tokens_seen": 21051344, "step": 36505 }, { "epoch": 5.437890974084004, "grad_norm": 0.18333490192890167, "learning_rate": 2.554326050260264e-05, "loss": 0.8046, "num_input_tokens_seen": 21054096, "step": 36510 }, { "epoch": 5.438635686624963, "grad_norm": 0.24179448187351227, "learning_rate": 2.5536763176004086e-05, "loss": 0.7835, "num_input_tokens_seen": 21056912, "step": 36515 }, { "epoch": 5.4393803991659215, "grad_norm": 0.37998801469802856, "learning_rate": 2.553026581313326e-05, "loss": 0.7855, "num_input_tokens_seen": 21059664, "step": 36520 }, { "epoch": 5.440125111706881, "grad_norm": 0.22032730281352997, "learning_rate": 2.5523768414429227e-05, "loss": 0.7992, "num_input_tokens_seen": 21062672, "step": 36525 }, { "epoch": 5.44086982424784, "grad_norm": 0.20439577102661133, "learning_rate": 2.551727098033105e-05, "loss": 0.7884, "num_input_tokens_seen": 21065488, "step": 36530 }, { "epoch": 5.4416145367888, "grad_norm": 0.25093087553977966, "learning_rate": 2.5510773511277804e-05, "loss": 0.7931, "num_input_tokens_seen": 21068432, "step": 36535 }, { "epoch": 5.442359249329758, "grad_norm": 0.19822371006011963, "learning_rate": 2.5504276007708566e-05, "loss": 0.815, "num_input_tokens_seen": 21071216, "step": 36540 }, { "epoch": 5.443103961870718, "grad_norm": 0.22367005050182343, "learning_rate": 2.54977784700624e-05, "loss": 0.7865, "num_input_tokens_seen": 21073840, "step": 36545 }, { "epoch": 5.443848674411677, "grad_norm": 0.25542986392974854, "learning_rate": 2.5491280898778386e-05, "loss": 0.8026, "num_input_tokens_seen": 21076880, "step": 36550 }, { "epoch": 5.444593386952636, "grad_norm": 0.22428900003433228, "learning_rate": 2.548478329429561e-05, "loss": 0.8111, "num_input_tokens_seen": 21079920, "step": 36555 }, { "epoch": 5.445338099493595, "grad_norm": 0.1868778020143509, "learning_rate": 2.547828565705316e-05, "loss": 0.8513, "num_input_tokens_seen": 21082672, "step": 36560 }, { "epoch": 5.446082812034555, "grad_norm": 0.22382870316505432, "learning_rate": 2.5471787987490092e-05, "loss": 0.8058, "num_input_tokens_seen": 21085360, "step": 36565 }, { "epoch": 5.446827524575514, "grad_norm": 0.18627764284610748, "learning_rate": 2.5465290286045518e-05, "loss": 0.813, "num_input_tokens_seen": 21088272, "step": 36570 }, { "epoch": 5.447572237116473, "grad_norm": 0.1738867461681366, "learning_rate": 2.5458792553158518e-05, "loss": 0.7931, "num_input_tokens_seen": 21091088, "step": 36575 }, { "epoch": 5.448316949657432, "grad_norm": 0.5273785591125488, "learning_rate": 2.5452294789268187e-05, "loss": 0.8373, "num_input_tokens_seen": 21094160, "step": 36580 }, { "epoch": 5.449061662198392, "grad_norm": 0.3316271901130676, "learning_rate": 2.544579699481361e-05, "loss": 0.7713, "num_input_tokens_seen": 21097072, "step": 36585 }, { "epoch": 5.44980637473935, "grad_norm": 0.3453378975391388, "learning_rate": 2.5439299170233883e-05, "loss": 0.8046, "num_input_tokens_seen": 21100112, "step": 36590 }, { "epoch": 5.45055108728031, "grad_norm": 0.24465802311897278, "learning_rate": 2.54328013159681e-05, "loss": 0.8291, "num_input_tokens_seen": 21103088, "step": 36595 }, { "epoch": 5.451295799821269, "grad_norm": 0.17219358682632446, "learning_rate": 2.5426303432455374e-05, "loss": 0.7923, "num_input_tokens_seen": 21106064, "step": 36600 }, { "epoch": 5.452040512362228, "grad_norm": 0.21067731082439423, "learning_rate": 2.5419805520134788e-05, "loss": 0.7948, "num_input_tokens_seen": 21108880, "step": 36605 }, { "epoch": 5.452785224903187, "grad_norm": 0.2389357089996338, "learning_rate": 2.5413307579445456e-05, "loss": 0.8234, "num_input_tokens_seen": 21111760, "step": 36610 }, { "epoch": 5.453529937444147, "grad_norm": 0.2596932649612427, "learning_rate": 2.540680961082647e-05, "loss": 0.8174, "num_input_tokens_seen": 21114352, "step": 36615 }, { "epoch": 5.454274649985106, "grad_norm": 0.25388291478157043, "learning_rate": 2.5400311614716955e-05, "loss": 0.8177, "num_input_tokens_seen": 21116976, "step": 36620 }, { "epoch": 5.455019362526065, "grad_norm": 0.22108498215675354, "learning_rate": 2.5393813591556002e-05, "loss": 0.8026, "num_input_tokens_seen": 21120080, "step": 36625 }, { "epoch": 5.455764075067024, "grad_norm": 0.20856955647468567, "learning_rate": 2.538731554178273e-05, "loss": 0.8137, "num_input_tokens_seen": 21122960, "step": 36630 }, { "epoch": 5.456508787607984, "grad_norm": 0.24939079582691193, "learning_rate": 2.5380817465836245e-05, "loss": 0.8016, "num_input_tokens_seen": 21126192, "step": 36635 }, { "epoch": 5.457253500148942, "grad_norm": 0.24711836874485016, "learning_rate": 2.5374319364155673e-05, "loss": 0.7723, "num_input_tokens_seen": 21129136, "step": 36640 }, { "epoch": 5.457998212689902, "grad_norm": 0.21031297743320465, "learning_rate": 2.536782123718011e-05, "loss": 0.8145, "num_input_tokens_seen": 21131824, "step": 36645 }, { "epoch": 5.458742925230861, "grad_norm": 0.1562989503145218, "learning_rate": 2.5361323085348687e-05, "loss": 0.7957, "num_input_tokens_seen": 21134704, "step": 36650 }, { "epoch": 5.4594876377718204, "grad_norm": 0.20837543904781342, "learning_rate": 2.5354824909100522e-05, "loss": 0.8037, "num_input_tokens_seen": 21137552, "step": 36655 }, { "epoch": 5.460232350312779, "grad_norm": 0.2109946459531784, "learning_rate": 2.5348326708874732e-05, "loss": 0.7841, "num_input_tokens_seen": 21140304, "step": 36660 }, { "epoch": 5.460977062853739, "grad_norm": 0.20785018801689148, "learning_rate": 2.5341828485110435e-05, "loss": 0.8091, "num_input_tokens_seen": 21143504, "step": 36665 }, { "epoch": 5.461721775394698, "grad_norm": 0.2302817404270172, "learning_rate": 2.5335330238246756e-05, "loss": 0.7997, "num_input_tokens_seen": 21146512, "step": 36670 }, { "epoch": 5.462466487935657, "grad_norm": 0.1955965757369995, "learning_rate": 2.532883196872283e-05, "loss": 0.8017, "num_input_tokens_seen": 21149520, "step": 36675 }, { "epoch": 5.463211200476616, "grad_norm": 0.2979443073272705, "learning_rate": 2.5322333676977778e-05, "loss": 0.8124, "num_input_tokens_seen": 21152432, "step": 36680 }, { "epoch": 5.463955913017575, "grad_norm": 0.21762119233608246, "learning_rate": 2.531583536345072e-05, "loss": 0.832, "num_input_tokens_seen": 21155632, "step": 36685 }, { "epoch": 5.464700625558534, "grad_norm": 0.2544008195400238, "learning_rate": 2.5309337028580792e-05, "loss": 0.7971, "num_input_tokens_seen": 21158640, "step": 36690 }, { "epoch": 5.465445338099494, "grad_norm": 0.1983514428138733, "learning_rate": 2.5302838672807128e-05, "loss": 0.8131, "num_input_tokens_seen": 21161552, "step": 36695 }, { "epoch": 5.466190050640453, "grad_norm": 0.24243851006031036, "learning_rate": 2.529634029656886e-05, "loss": 0.7877, "num_input_tokens_seen": 21164112, "step": 36700 }, { "epoch": 5.466934763181412, "grad_norm": 0.2710299789905548, "learning_rate": 2.528984190030512e-05, "loss": 0.7968, "num_input_tokens_seen": 21167216, "step": 36705 }, { "epoch": 5.467679475722371, "grad_norm": 0.2936908006668091, "learning_rate": 2.5283343484455036e-05, "loss": 0.7933, "num_input_tokens_seen": 21170064, "step": 36710 }, { "epoch": 5.46842418826333, "grad_norm": 0.2573181092739105, "learning_rate": 2.5276845049457754e-05, "loss": 0.8107, "num_input_tokens_seen": 21172944, "step": 36715 }, { "epoch": 5.46916890080429, "grad_norm": 0.2578737735748291, "learning_rate": 2.5270346595752414e-05, "loss": 0.8139, "num_input_tokens_seen": 21175888, "step": 36720 }, { "epoch": 5.469913613345248, "grad_norm": 0.17779606580734253, "learning_rate": 2.526384812377815e-05, "loss": 0.801, "num_input_tokens_seen": 21178832, "step": 36725 }, { "epoch": 5.470658325886208, "grad_norm": 0.20840999484062195, "learning_rate": 2.525734963397409e-05, "loss": 0.7854, "num_input_tokens_seen": 21181840, "step": 36730 }, { "epoch": 5.471403038427167, "grad_norm": 0.3512675166130066, "learning_rate": 2.5250851126779397e-05, "loss": 0.8007, "num_input_tokens_seen": 21185008, "step": 36735 }, { "epoch": 5.4721477509681264, "grad_norm": 0.21940869092941284, "learning_rate": 2.5244352602633215e-05, "loss": 0.7997, "num_input_tokens_seen": 21188176, "step": 36740 }, { "epoch": 5.472892463509085, "grad_norm": 0.1932862550020218, "learning_rate": 2.5237854061974665e-05, "loss": 0.7863, "num_input_tokens_seen": 21190992, "step": 36745 }, { "epoch": 5.473637176050045, "grad_norm": 0.27743634581565857, "learning_rate": 2.5231355505242906e-05, "loss": 0.8252, "num_input_tokens_seen": 21193904, "step": 36750 }, { "epoch": 5.474381888591004, "grad_norm": 0.23604530096054077, "learning_rate": 2.5224856932877083e-05, "loss": 0.8044, "num_input_tokens_seen": 21196752, "step": 36755 }, { "epoch": 5.475126601131963, "grad_norm": 0.22133342921733856, "learning_rate": 2.5218358345316346e-05, "loss": 0.8005, "num_input_tokens_seen": 21199536, "step": 36760 }, { "epoch": 5.475871313672922, "grad_norm": 0.13136228919029236, "learning_rate": 2.521185974299983e-05, "loss": 0.7986, "num_input_tokens_seen": 21202512, "step": 36765 }, { "epoch": 5.476616026213882, "grad_norm": 0.23354554176330566, "learning_rate": 2.52053611263667e-05, "loss": 0.7958, "num_input_tokens_seen": 21205424, "step": 36770 }, { "epoch": 5.47736073875484, "grad_norm": 0.19393935799598694, "learning_rate": 2.5198862495856106e-05, "loss": 0.7807, "num_input_tokens_seen": 21208400, "step": 36775 }, { "epoch": 5.4781054512958, "grad_norm": 0.17908057570457458, "learning_rate": 2.519236385190719e-05, "loss": 0.7707, "num_input_tokens_seen": 21210960, "step": 36780 }, { "epoch": 5.478850163836759, "grad_norm": 0.1829369068145752, "learning_rate": 2.5185865194959103e-05, "loss": 0.7818, "num_input_tokens_seen": 21213776, "step": 36785 }, { "epoch": 5.4795948763777185, "grad_norm": 0.23235562443733215, "learning_rate": 2.517936652545101e-05, "loss": 0.8292, "num_input_tokens_seen": 21216464, "step": 36790 }, { "epoch": 5.480339588918677, "grad_norm": 0.20333905518054962, "learning_rate": 2.5172867843822046e-05, "loss": 0.777, "num_input_tokens_seen": 21219248, "step": 36795 }, { "epoch": 5.481084301459637, "grad_norm": 0.1952379196882248, "learning_rate": 2.516636915051138e-05, "loss": 0.7801, "num_input_tokens_seen": 21222096, "step": 36800 }, { "epoch": 5.481829014000596, "grad_norm": 0.18675842881202698, "learning_rate": 2.515987044595817e-05, "loss": 0.8008, "num_input_tokens_seen": 21224752, "step": 36805 }, { "epoch": 5.482573726541555, "grad_norm": 0.2326582819223404, "learning_rate": 2.5153371730601556e-05, "loss": 0.7786, "num_input_tokens_seen": 21227664, "step": 36810 }, { "epoch": 5.483318439082514, "grad_norm": 0.22921191155910492, "learning_rate": 2.51468730048807e-05, "loss": 0.8028, "num_input_tokens_seen": 21230512, "step": 36815 }, { "epoch": 5.484063151623474, "grad_norm": 0.1587982475757599, "learning_rate": 2.5140374269234772e-05, "loss": 0.8093, "num_input_tokens_seen": 21233328, "step": 36820 }, { "epoch": 5.4848078641644324, "grad_norm": 0.19274479150772095, "learning_rate": 2.5133875524102922e-05, "loss": 0.7934, "num_input_tokens_seen": 21236240, "step": 36825 }, { "epoch": 5.485552576705392, "grad_norm": 0.1850021928548813, "learning_rate": 2.5127376769924306e-05, "loss": 0.8007, "num_input_tokens_seen": 21239024, "step": 36830 }, { "epoch": 5.486297289246351, "grad_norm": 0.2669530212879181, "learning_rate": 2.512087800713808e-05, "loss": 0.7826, "num_input_tokens_seen": 21241936, "step": 36835 }, { "epoch": 5.4870420017873105, "grad_norm": 0.19981254637241364, "learning_rate": 2.511437923618343e-05, "loss": 0.7994, "num_input_tokens_seen": 21244880, "step": 36840 }, { "epoch": 5.487786714328269, "grad_norm": 0.28781089186668396, "learning_rate": 2.510788045749948e-05, "loss": 0.8145, "num_input_tokens_seen": 21247536, "step": 36845 }, { "epoch": 5.488531426869228, "grad_norm": 0.27752214670181274, "learning_rate": 2.5101381671525404e-05, "loss": 0.82, "num_input_tokens_seen": 21250992, "step": 36850 }, { "epoch": 5.489276139410188, "grad_norm": 0.3032416105270386, "learning_rate": 2.5094882878700372e-05, "loss": 0.7782, "num_input_tokens_seen": 21253776, "step": 36855 }, { "epoch": 5.490020851951147, "grad_norm": 0.18445251882076263, "learning_rate": 2.5088384079463544e-05, "loss": 0.7727, "num_input_tokens_seen": 21256432, "step": 36860 }, { "epoch": 5.490765564492106, "grad_norm": 0.22632260620594025, "learning_rate": 2.5081885274254076e-05, "loss": 0.7885, "num_input_tokens_seen": 21259216, "step": 36865 }, { "epoch": 5.491510277033065, "grad_norm": 0.2533699572086334, "learning_rate": 2.5075386463511135e-05, "loss": 0.8244, "num_input_tokens_seen": 21262352, "step": 36870 }, { "epoch": 5.4922549895740245, "grad_norm": 0.19239647686481476, "learning_rate": 2.5068887647673878e-05, "loss": 0.793, "num_input_tokens_seen": 21265136, "step": 36875 }, { "epoch": 5.492999702114983, "grad_norm": 0.20925331115722656, "learning_rate": 2.5062388827181483e-05, "loss": 0.8234, "num_input_tokens_seen": 21267920, "step": 36880 }, { "epoch": 5.493744414655943, "grad_norm": 0.21708692610263824, "learning_rate": 2.5055890002473097e-05, "loss": 0.815, "num_input_tokens_seen": 21270736, "step": 36885 }, { "epoch": 5.494489127196902, "grad_norm": 0.27516835927963257, "learning_rate": 2.5049391173987896e-05, "loss": 0.7847, "num_input_tokens_seen": 21273776, "step": 36890 }, { "epoch": 5.495233839737861, "grad_norm": 0.24744342267513275, "learning_rate": 2.5042892342165036e-05, "loss": 0.8083, "num_input_tokens_seen": 21276688, "step": 36895 }, { "epoch": 5.49597855227882, "grad_norm": 0.2364145666360855, "learning_rate": 2.5036393507443694e-05, "loss": 0.7948, "num_input_tokens_seen": 21279376, "step": 36900 }, { "epoch": 5.49672326481978, "grad_norm": 0.23860600590705872, "learning_rate": 2.5029894670263025e-05, "loss": 0.7975, "num_input_tokens_seen": 21282096, "step": 36905 }, { "epoch": 5.4974679773607384, "grad_norm": 0.2017688900232315, "learning_rate": 2.502339583106219e-05, "loss": 0.7954, "num_input_tokens_seen": 21285456, "step": 36910 }, { "epoch": 5.498212689901698, "grad_norm": 0.29148557782173157, "learning_rate": 2.5016896990280357e-05, "loss": 0.7882, "num_input_tokens_seen": 21288464, "step": 36915 }, { "epoch": 5.498957402442657, "grad_norm": 0.190884068608284, "learning_rate": 2.50103981483567e-05, "loss": 0.8025, "num_input_tokens_seen": 21291472, "step": 36920 }, { "epoch": 5.4997021149836165, "grad_norm": 0.1871921271085739, "learning_rate": 2.5003899305730383e-05, "loss": 0.7837, "num_input_tokens_seen": 21294256, "step": 36925 }, { "epoch": 5.5, "eval_loss": 0.8044227361679077, "eval_runtime": 45.3156, "eval_samples_per_second": 65.849, "eval_steps_per_second": 16.462, "num_input_tokens_seen": 21295472, "step": 36927 }, { "epoch": 5.500446827524575, "grad_norm": 0.2743261754512787, "learning_rate": 2.499740046284056e-05, "loss": 0.7897, "num_input_tokens_seen": 21297424, "step": 36930 }, { "epoch": 5.501191540065535, "grad_norm": 0.19595679640769958, "learning_rate": 2.49909016201264e-05, "loss": 0.7636, "num_input_tokens_seen": 21300368, "step": 36935 }, { "epoch": 5.501936252606494, "grad_norm": 0.17157943546772003, "learning_rate": 2.498440277802708e-05, "loss": 0.776, "num_input_tokens_seen": 21303152, "step": 36940 }, { "epoch": 5.502680965147453, "grad_norm": 0.2567128539085388, "learning_rate": 2.497790393698175e-05, "loss": 0.7785, "num_input_tokens_seen": 21306256, "step": 36945 }, { "epoch": 5.503425677688412, "grad_norm": 0.1764664500951767, "learning_rate": 2.4971405097429595e-05, "loss": 0.8066, "num_input_tokens_seen": 21308880, "step": 36950 }, { "epoch": 5.504170390229372, "grad_norm": 0.24988563358783722, "learning_rate": 2.4964906259809754e-05, "loss": 0.7952, "num_input_tokens_seen": 21311536, "step": 36955 }, { "epoch": 5.5049151027703305, "grad_norm": 0.16698424518108368, "learning_rate": 2.495840742456141e-05, "loss": 0.8004, "num_input_tokens_seen": 21314352, "step": 36960 }, { "epoch": 5.50565981531129, "grad_norm": 0.16390831768512726, "learning_rate": 2.495190859212372e-05, "loss": 0.7768, "num_input_tokens_seen": 21317008, "step": 36965 }, { "epoch": 5.506404527852249, "grad_norm": 0.15846283733844757, "learning_rate": 2.4945409762935855e-05, "loss": 0.7836, "num_input_tokens_seen": 21320080, "step": 36970 }, { "epoch": 5.5071492403932085, "grad_norm": 0.19753125309944153, "learning_rate": 2.4938910937436974e-05, "loss": 0.8103, "num_input_tokens_seen": 21322672, "step": 36975 }, { "epoch": 5.507893952934167, "grad_norm": 0.27892357110977173, "learning_rate": 2.4932412116066243e-05, "loss": 0.8163, "num_input_tokens_seen": 21325840, "step": 36980 }, { "epoch": 5.508638665475127, "grad_norm": 0.26113826036453247, "learning_rate": 2.492591329926283e-05, "loss": 0.8091, "num_input_tokens_seen": 21328720, "step": 36985 }, { "epoch": 5.509383378016086, "grad_norm": 0.18100014328956604, "learning_rate": 2.49194144874659e-05, "loss": 0.7927, "num_input_tokens_seen": 21331472, "step": 36990 }, { "epoch": 5.510128090557045, "grad_norm": 0.26026952266693115, "learning_rate": 2.4912915681114603e-05, "loss": 0.7769, "num_input_tokens_seen": 21334608, "step": 36995 }, { "epoch": 5.510872803098004, "grad_norm": 0.20950786769390106, "learning_rate": 2.490641688064811e-05, "loss": 0.7762, "num_input_tokens_seen": 21337392, "step": 37000 }, { "epoch": 5.511617515638964, "grad_norm": 0.1876445859670639, "learning_rate": 2.4899918086505585e-05, "loss": 0.7953, "num_input_tokens_seen": 21339984, "step": 37005 }, { "epoch": 5.5123622281799225, "grad_norm": 0.268653929233551, "learning_rate": 2.489341929912619e-05, "loss": 0.801, "num_input_tokens_seen": 21342512, "step": 37010 }, { "epoch": 5.513106940720881, "grad_norm": 0.19912183284759521, "learning_rate": 2.488692051894908e-05, "loss": 0.8395, "num_input_tokens_seen": 21345552, "step": 37015 }, { "epoch": 5.513851653261841, "grad_norm": 0.19871383905410767, "learning_rate": 2.488042174641343e-05, "loss": 0.7565, "num_input_tokens_seen": 21348336, "step": 37020 }, { "epoch": 5.5145963658028005, "grad_norm": 0.18960429728031158, "learning_rate": 2.4873922981958383e-05, "loss": 0.812, "num_input_tokens_seen": 21351120, "step": 37025 }, { "epoch": 5.515341078343759, "grad_norm": 0.21742674708366394, "learning_rate": 2.4867424226023123e-05, "loss": 0.8, "num_input_tokens_seen": 21353968, "step": 37030 }, { "epoch": 5.516085790884718, "grad_norm": 0.2683247923851013, "learning_rate": 2.486092547904678e-05, "loss": 0.8269, "num_input_tokens_seen": 21356688, "step": 37035 }, { "epoch": 5.516830503425678, "grad_norm": 0.3467753231525421, "learning_rate": 2.4854426741468537e-05, "loss": 0.8325, "num_input_tokens_seen": 21359856, "step": 37040 }, { "epoch": 5.517575215966637, "grad_norm": 0.231760174036026, "learning_rate": 2.4847928013727537e-05, "loss": 0.8012, "num_input_tokens_seen": 21362896, "step": 37045 }, { "epoch": 5.518319928507596, "grad_norm": 0.22510917484760284, "learning_rate": 2.484142929626294e-05, "loss": 0.8037, "num_input_tokens_seen": 21365776, "step": 37050 }, { "epoch": 5.519064641048555, "grad_norm": 0.3140954375267029, "learning_rate": 2.4834930589513915e-05, "loss": 0.7871, "num_input_tokens_seen": 21369136, "step": 37055 }, { "epoch": 5.5198093535895145, "grad_norm": 0.2577010989189148, "learning_rate": 2.4828431893919608e-05, "loss": 0.8241, "num_input_tokens_seen": 21372144, "step": 37060 }, { "epoch": 5.520554066130473, "grad_norm": 0.20098194479942322, "learning_rate": 2.482193320991917e-05, "loss": 0.7962, "num_input_tokens_seen": 21374768, "step": 37065 }, { "epoch": 5.521298778671433, "grad_norm": 0.21723318099975586, "learning_rate": 2.4815434537951773e-05, "loss": 0.8046, "num_input_tokens_seen": 21377552, "step": 37070 }, { "epoch": 5.522043491212392, "grad_norm": 0.26648110151290894, "learning_rate": 2.4808935878456556e-05, "loss": 0.8335, "num_input_tokens_seen": 21380816, "step": 37075 }, { "epoch": 5.522788203753351, "grad_norm": 0.2039794772863388, "learning_rate": 2.480243723187267e-05, "loss": 0.8023, "num_input_tokens_seen": 21383472, "step": 37080 }, { "epoch": 5.52353291629431, "grad_norm": 0.32386326789855957, "learning_rate": 2.4795938598639273e-05, "loss": 0.7918, "num_input_tokens_seen": 21386608, "step": 37085 }, { "epoch": 5.52427762883527, "grad_norm": 0.23318786919116974, "learning_rate": 2.478943997919552e-05, "loss": 0.7674, "num_input_tokens_seen": 21389456, "step": 37090 }, { "epoch": 5.5250223413762285, "grad_norm": 0.18454328179359436, "learning_rate": 2.4782941373980552e-05, "loss": 0.7971, "num_input_tokens_seen": 21392624, "step": 37095 }, { "epoch": 5.525767053917188, "grad_norm": 0.26897957921028137, "learning_rate": 2.4776442783433523e-05, "loss": 0.7879, "num_input_tokens_seen": 21395504, "step": 37100 }, { "epoch": 5.526511766458147, "grad_norm": 0.24249137938022614, "learning_rate": 2.476994420799359e-05, "loss": 0.8167, "num_input_tokens_seen": 21398064, "step": 37105 }, { "epoch": 5.5272564789991065, "grad_norm": 0.3004131019115448, "learning_rate": 2.4763445648099894e-05, "loss": 0.7803, "num_input_tokens_seen": 21401296, "step": 37110 }, { "epoch": 5.528001191540065, "grad_norm": 0.25463855266571045, "learning_rate": 2.4756947104191573e-05, "loss": 0.7792, "num_input_tokens_seen": 21403952, "step": 37115 }, { "epoch": 5.528745904081025, "grad_norm": 0.22136662900447845, "learning_rate": 2.4750448576707773e-05, "loss": 0.8006, "num_input_tokens_seen": 21406608, "step": 37120 }, { "epoch": 5.529490616621984, "grad_norm": 0.2329426109790802, "learning_rate": 2.474395006608765e-05, "loss": 0.8031, "num_input_tokens_seen": 21409424, "step": 37125 }, { "epoch": 5.530235329162943, "grad_norm": 0.23813223838806152, "learning_rate": 2.4737451572770337e-05, "loss": 0.832, "num_input_tokens_seen": 21412176, "step": 37130 }, { "epoch": 5.530980041703902, "grad_norm": 0.19915610551834106, "learning_rate": 2.4730953097194987e-05, "loss": 0.815, "num_input_tokens_seen": 21415152, "step": 37135 }, { "epoch": 5.531724754244862, "grad_norm": 0.20105241239070892, "learning_rate": 2.4724454639800724e-05, "loss": 0.7912, "num_input_tokens_seen": 21418384, "step": 37140 }, { "epoch": 5.5324694667858205, "grad_norm": 0.23876740038394928, "learning_rate": 2.4717956201026694e-05, "loss": 0.8233, "num_input_tokens_seen": 21421392, "step": 37145 }, { "epoch": 5.53321417932678, "grad_norm": 0.2042243629693985, "learning_rate": 2.4711457781312052e-05, "loss": 0.7705, "num_input_tokens_seen": 21424272, "step": 37150 }, { "epoch": 5.533958891867739, "grad_norm": 0.26755377650260925, "learning_rate": 2.4704959381095914e-05, "loss": 0.7546, "num_input_tokens_seen": 21427472, "step": 37155 }, { "epoch": 5.534703604408699, "grad_norm": 0.1577046811580658, "learning_rate": 2.4698461000817415e-05, "loss": 0.8377, "num_input_tokens_seen": 21430480, "step": 37160 }, { "epoch": 5.535448316949657, "grad_norm": 0.20862270891666412, "learning_rate": 2.46919626409157e-05, "loss": 0.8028, "num_input_tokens_seen": 21433616, "step": 37165 }, { "epoch": 5.536193029490617, "grad_norm": 0.17207078635692596, "learning_rate": 2.46854643018299e-05, "loss": 0.81, "num_input_tokens_seen": 21436432, "step": 37170 }, { "epoch": 5.536937742031576, "grad_norm": 0.24578280746936798, "learning_rate": 2.4678965983999133e-05, "loss": 0.812, "num_input_tokens_seen": 21439376, "step": 37175 }, { "epoch": 5.537682454572535, "grad_norm": 0.19935455918312073, "learning_rate": 2.4672467687862545e-05, "loss": 0.8149, "num_input_tokens_seen": 21442224, "step": 37180 }, { "epoch": 5.538427167113494, "grad_norm": 0.1911114603281021, "learning_rate": 2.4665969413859264e-05, "loss": 0.8073, "num_input_tokens_seen": 21445040, "step": 37185 }, { "epoch": 5.539171879654454, "grad_norm": 0.1872694343328476, "learning_rate": 2.4659471162428404e-05, "loss": 0.794, "num_input_tokens_seen": 21447728, "step": 37190 }, { "epoch": 5.5399165921954125, "grad_norm": 0.19258148968219757, "learning_rate": 2.4652972934009112e-05, "loss": 0.7992, "num_input_tokens_seen": 21450640, "step": 37195 }, { "epoch": 5.540661304736371, "grad_norm": 0.17256642878055573, "learning_rate": 2.4646474729040486e-05, "loss": 0.8004, "num_input_tokens_seen": 21453616, "step": 37200 }, { "epoch": 5.541406017277331, "grad_norm": 0.24554356932640076, "learning_rate": 2.4639976547961665e-05, "loss": 0.8099, "num_input_tokens_seen": 21456624, "step": 37205 }, { "epoch": 5.542150729818291, "grad_norm": 0.1939670592546463, "learning_rate": 2.4633478391211762e-05, "loss": 0.7938, "num_input_tokens_seen": 21459280, "step": 37210 }, { "epoch": 5.542895442359249, "grad_norm": 0.19102934002876282, "learning_rate": 2.46269802592299e-05, "loss": 0.7948, "num_input_tokens_seen": 21462192, "step": 37215 }, { "epoch": 5.543640154900208, "grad_norm": 0.16178305447101593, "learning_rate": 2.4620482152455197e-05, "loss": 0.8171, "num_input_tokens_seen": 21465008, "step": 37220 }, { "epoch": 5.544384867441168, "grad_norm": 0.20878836512565613, "learning_rate": 2.4613984071326762e-05, "loss": 0.7923, "num_input_tokens_seen": 21467760, "step": 37225 }, { "epoch": 5.5451295799821265, "grad_norm": 0.21959148347377777, "learning_rate": 2.4607486016283717e-05, "loss": 0.8094, "num_input_tokens_seen": 21470576, "step": 37230 }, { "epoch": 5.545874292523086, "grad_norm": 0.2808789610862732, "learning_rate": 2.4600987987765183e-05, "loss": 0.8002, "num_input_tokens_seen": 21473840, "step": 37235 }, { "epoch": 5.546619005064045, "grad_norm": 0.2399771511554718, "learning_rate": 2.459448998621025e-05, "loss": 0.7932, "num_input_tokens_seen": 21476848, "step": 37240 }, { "epoch": 5.547363717605005, "grad_norm": 0.2304999828338623, "learning_rate": 2.458799201205803e-05, "loss": 0.7956, "num_input_tokens_seen": 21479632, "step": 37245 }, { "epoch": 5.548108430145963, "grad_norm": 0.24176530539989471, "learning_rate": 2.4581494065747634e-05, "loss": 0.8336, "num_input_tokens_seen": 21482576, "step": 37250 }, { "epoch": 5.548853142686923, "grad_norm": 0.2367209941148758, "learning_rate": 2.4574996147718175e-05, "loss": 0.7909, "num_input_tokens_seen": 21485360, "step": 37255 }, { "epoch": 5.549597855227882, "grad_norm": 0.25432491302490234, "learning_rate": 2.456849825840874e-05, "loss": 0.8034, "num_input_tokens_seen": 21488144, "step": 37260 }, { "epoch": 5.550342567768841, "grad_norm": 0.24175617098808289, "learning_rate": 2.4562000398258442e-05, "loss": 0.7849, "num_input_tokens_seen": 21490960, "step": 37265 }, { "epoch": 5.5510872803098, "grad_norm": 0.2984062433242798, "learning_rate": 2.455550256770638e-05, "loss": 0.8276, "num_input_tokens_seen": 21494192, "step": 37270 }, { "epoch": 5.55183199285076, "grad_norm": 0.21748483180999756, "learning_rate": 2.454900476719165e-05, "loss": 0.8204, "num_input_tokens_seen": 21497072, "step": 37275 }, { "epoch": 5.5525767053917185, "grad_norm": 0.18422436714172363, "learning_rate": 2.454250699715334e-05, "loss": 0.7828, "num_input_tokens_seen": 21499824, "step": 37280 }, { "epoch": 5.553321417932678, "grad_norm": 0.21237677335739136, "learning_rate": 2.453600925803054e-05, "loss": 0.7849, "num_input_tokens_seen": 21502640, "step": 37285 }, { "epoch": 5.554066130473637, "grad_norm": 0.2414228916168213, "learning_rate": 2.4529511550262357e-05, "loss": 0.7924, "num_input_tokens_seen": 21505680, "step": 37290 }, { "epoch": 5.554810843014597, "grad_norm": 0.19729867577552795, "learning_rate": 2.4523013874287863e-05, "loss": 0.7955, "num_input_tokens_seen": 21508720, "step": 37295 }, { "epoch": 5.555555555555555, "grad_norm": 0.23044297099113464, "learning_rate": 2.451651623054616e-05, "loss": 0.7954, "num_input_tokens_seen": 21511472, "step": 37300 }, { "epoch": 5.556300268096515, "grad_norm": 0.24514351785182953, "learning_rate": 2.451001861947632e-05, "loss": 0.8115, "num_input_tokens_seen": 21514320, "step": 37305 }, { "epoch": 5.557044980637474, "grad_norm": 0.25457069277763367, "learning_rate": 2.4503521041517426e-05, "loss": 0.8042, "num_input_tokens_seen": 21517104, "step": 37310 }, { "epoch": 5.557789693178433, "grad_norm": 0.25568556785583496, "learning_rate": 2.4497023497108575e-05, "loss": 0.8386, "num_input_tokens_seen": 21519984, "step": 37315 }, { "epoch": 5.558534405719392, "grad_norm": 0.23153528571128845, "learning_rate": 2.4490525986688826e-05, "loss": 0.8286, "num_input_tokens_seen": 21523120, "step": 37320 }, { "epoch": 5.559279118260352, "grad_norm": 0.1753869205713272, "learning_rate": 2.4484028510697253e-05, "loss": 0.7745, "num_input_tokens_seen": 21525872, "step": 37325 }, { "epoch": 5.560023830801311, "grad_norm": 0.27496641874313354, "learning_rate": 2.4477531069572934e-05, "loss": 0.8327, "num_input_tokens_seen": 21528752, "step": 37330 }, { "epoch": 5.56076854334227, "grad_norm": 0.1859918087720871, "learning_rate": 2.447103366375495e-05, "loss": 0.7966, "num_input_tokens_seen": 21531280, "step": 37335 }, { "epoch": 5.561513255883229, "grad_norm": 0.2741747796535492, "learning_rate": 2.4464536293682353e-05, "loss": 0.7961, "num_input_tokens_seen": 21534576, "step": 37340 }, { "epoch": 5.562257968424189, "grad_norm": 0.23935788869857788, "learning_rate": 2.4458038959794218e-05, "loss": 0.7953, "num_input_tokens_seen": 21537488, "step": 37345 }, { "epoch": 5.563002680965147, "grad_norm": 0.20709773898124695, "learning_rate": 2.4451541662529605e-05, "loss": 0.8229, "num_input_tokens_seen": 21540336, "step": 37350 }, { "epoch": 5.563747393506107, "grad_norm": 0.14857257902622223, "learning_rate": 2.444504440232759e-05, "loss": 0.7804, "num_input_tokens_seen": 21543024, "step": 37355 }, { "epoch": 5.564492106047066, "grad_norm": 0.1995203197002411, "learning_rate": 2.4438547179627203e-05, "loss": 0.8536, "num_input_tokens_seen": 21545872, "step": 37360 }, { "epoch": 5.5652368185880245, "grad_norm": 0.29723823070526123, "learning_rate": 2.443204999486752e-05, "loss": 0.8382, "num_input_tokens_seen": 21549008, "step": 37365 }, { "epoch": 5.565981531128984, "grad_norm": 0.18976496160030365, "learning_rate": 2.4425552848487588e-05, "loss": 0.8408, "num_input_tokens_seen": 21551632, "step": 37370 }, { "epoch": 5.566726243669944, "grad_norm": 0.19121532142162323, "learning_rate": 2.4419055740926456e-05, "loss": 0.8086, "num_input_tokens_seen": 21554416, "step": 37375 }, { "epoch": 5.567470956210903, "grad_norm": 0.258186936378479, "learning_rate": 2.4412558672623177e-05, "loss": 0.8208, "num_input_tokens_seen": 21557040, "step": 37380 }, { "epoch": 5.568215668751861, "grad_norm": 0.2606261670589447, "learning_rate": 2.44060616440168e-05, "loss": 0.8109, "num_input_tokens_seen": 21559888, "step": 37385 }, { "epoch": 5.568960381292821, "grad_norm": 0.15057726204395294, "learning_rate": 2.4399564655546354e-05, "loss": 0.7679, "num_input_tokens_seen": 21563088, "step": 37390 }, { "epoch": 5.569705093833781, "grad_norm": 0.16230244934558868, "learning_rate": 2.43930677076509e-05, "loss": 0.7938, "num_input_tokens_seen": 21565744, "step": 37395 }, { "epoch": 5.570449806374739, "grad_norm": 0.19795061647891998, "learning_rate": 2.4386570800769447e-05, "loss": 0.8111, "num_input_tokens_seen": 21568496, "step": 37400 }, { "epoch": 5.571194518915698, "grad_norm": 0.19887277483940125, "learning_rate": 2.438007393534106e-05, "loss": 0.8006, "num_input_tokens_seen": 21571120, "step": 37405 }, { "epoch": 5.571939231456658, "grad_norm": 0.2602035701274872, "learning_rate": 2.4373577111804744e-05, "loss": 0.7789, "num_input_tokens_seen": 21573712, "step": 37410 }, { "epoch": 5.572683943997617, "grad_norm": 0.22285498678684235, "learning_rate": 2.436708033059954e-05, "loss": 0.8217, "num_input_tokens_seen": 21576240, "step": 37415 }, { "epoch": 5.573428656538576, "grad_norm": 0.21150104701519012, "learning_rate": 2.4360583592164483e-05, "loss": 0.7936, "num_input_tokens_seen": 21579248, "step": 37420 }, { "epoch": 5.574173369079535, "grad_norm": 0.23635654151439667, "learning_rate": 2.435408689693858e-05, "loss": 0.7931, "num_input_tokens_seen": 21582160, "step": 37425 }, { "epoch": 5.574918081620495, "grad_norm": 0.20904812216758728, "learning_rate": 2.4347590245360857e-05, "loss": 0.7816, "num_input_tokens_seen": 21585360, "step": 37430 }, { "epoch": 5.575662794161453, "grad_norm": 0.2705075144767761, "learning_rate": 2.4341093637870345e-05, "loss": 0.8151, "num_input_tokens_seen": 21588464, "step": 37435 }, { "epoch": 5.576407506702413, "grad_norm": 0.22889019548892975, "learning_rate": 2.433459707490604e-05, "loss": 0.8202, "num_input_tokens_seen": 21591504, "step": 37440 }, { "epoch": 5.577152219243372, "grad_norm": 0.21459047496318817, "learning_rate": 2.4328100556906956e-05, "loss": 0.8159, "num_input_tokens_seen": 21594672, "step": 37445 }, { "epoch": 5.577896931784331, "grad_norm": 0.16149398684501648, "learning_rate": 2.4321604084312103e-05, "loss": 0.7985, "num_input_tokens_seen": 21597712, "step": 37450 }, { "epoch": 5.57864164432529, "grad_norm": 0.22019141912460327, "learning_rate": 2.4315107657560492e-05, "loss": 0.8007, "num_input_tokens_seen": 21600656, "step": 37455 }, { "epoch": 5.57938635686625, "grad_norm": 0.2701857089996338, "learning_rate": 2.4308611277091118e-05, "loss": 0.8006, "num_input_tokens_seen": 21603312, "step": 37460 }, { "epoch": 5.580131069407209, "grad_norm": 0.19391989707946777, "learning_rate": 2.4302114943342986e-05, "loss": 0.7863, "num_input_tokens_seen": 21605872, "step": 37465 }, { "epoch": 5.580875781948168, "grad_norm": 0.2547421455383301, "learning_rate": 2.4295618656755084e-05, "loss": 0.7637, "num_input_tokens_seen": 21608624, "step": 37470 }, { "epoch": 5.581620494489127, "grad_norm": 0.2652811110019684, "learning_rate": 2.4289122417766422e-05, "loss": 0.8076, "num_input_tokens_seen": 21611408, "step": 37475 }, { "epoch": 5.582365207030087, "grad_norm": 0.23055461049079895, "learning_rate": 2.4282626226815963e-05, "loss": 0.8086, "num_input_tokens_seen": 21614192, "step": 37480 }, { "epoch": 5.583109919571045, "grad_norm": 0.2174755185842514, "learning_rate": 2.4276130084342714e-05, "loss": 0.8061, "num_input_tokens_seen": 21617200, "step": 37485 }, { "epoch": 5.583854632112005, "grad_norm": 0.19798718392848969, "learning_rate": 2.4269633990785645e-05, "loss": 0.8092, "num_input_tokens_seen": 21619984, "step": 37490 }, { "epoch": 5.584599344652964, "grad_norm": 0.2550145387649536, "learning_rate": 2.4263137946583743e-05, "loss": 0.8074, "num_input_tokens_seen": 21622832, "step": 37495 }, { "epoch": 5.5853440571939235, "grad_norm": 0.2118058055639267, "learning_rate": 2.4256641952175983e-05, "loss": 0.8084, "num_input_tokens_seen": 21625680, "step": 37500 }, { "epoch": 5.586088769734882, "grad_norm": 0.2618430554866791, "learning_rate": 2.425014600800134e-05, "loss": 0.8202, "num_input_tokens_seen": 21628336, "step": 37505 }, { "epoch": 5.586833482275842, "grad_norm": 0.16998374462127686, "learning_rate": 2.4243650114498776e-05, "loss": 0.8122, "num_input_tokens_seen": 21631280, "step": 37510 }, { "epoch": 5.587578194816801, "grad_norm": 0.16046835482120514, "learning_rate": 2.4237154272107274e-05, "loss": 0.8212, "num_input_tokens_seen": 21634224, "step": 37515 }, { "epoch": 5.58832290735776, "grad_norm": 0.15830740332603455, "learning_rate": 2.423065848126578e-05, "loss": 0.7993, "num_input_tokens_seen": 21637168, "step": 37520 }, { "epoch": 5.589067619898719, "grad_norm": 0.18302464485168457, "learning_rate": 2.4224162742413252e-05, "loss": 0.7773, "num_input_tokens_seen": 21640144, "step": 37525 }, { "epoch": 5.589812332439678, "grad_norm": 0.18461717665195465, "learning_rate": 2.421766705598865e-05, "loss": 0.8028, "num_input_tokens_seen": 21642896, "step": 37530 }, { "epoch": 5.590557044980637, "grad_norm": 0.377451092004776, "learning_rate": 2.4211171422430937e-05, "loss": 0.7851, "num_input_tokens_seen": 21646160, "step": 37535 }, { "epoch": 5.591301757521597, "grad_norm": 0.17759697139263153, "learning_rate": 2.4204675842179046e-05, "loss": 0.7824, "num_input_tokens_seen": 21648976, "step": 37540 }, { "epoch": 5.592046470062556, "grad_norm": 0.20780812203884125, "learning_rate": 2.4198180315671927e-05, "loss": 0.8051, "num_input_tokens_seen": 21651728, "step": 37545 }, { "epoch": 5.592791182603515, "grad_norm": 0.19658061861991882, "learning_rate": 2.4191684843348524e-05, "loss": 0.7938, "num_input_tokens_seen": 21654512, "step": 37550 }, { "epoch": 5.593535895144474, "grad_norm": 0.24241632223129272, "learning_rate": 2.418518942564778e-05, "loss": 0.8283, "num_input_tokens_seen": 21657296, "step": 37555 }, { "epoch": 5.594280607685434, "grad_norm": 0.22836101055145264, "learning_rate": 2.4178694063008616e-05, "loss": 0.8089, "num_input_tokens_seen": 21660176, "step": 37560 }, { "epoch": 5.595025320226393, "grad_norm": 0.21190935373306274, "learning_rate": 2.4172198755869962e-05, "loss": 0.8071, "num_input_tokens_seen": 21663440, "step": 37565 }, { "epoch": 5.595770032767351, "grad_norm": 0.19764788448810577, "learning_rate": 2.4165703504670757e-05, "loss": 0.8037, "num_input_tokens_seen": 21666224, "step": 37570 }, { "epoch": 5.596514745308311, "grad_norm": 0.17632895708084106, "learning_rate": 2.4159208309849916e-05, "loss": 0.792, "num_input_tokens_seen": 21669104, "step": 37575 }, { "epoch": 5.59725945784927, "grad_norm": 0.18408967554569244, "learning_rate": 2.4152713171846355e-05, "loss": 0.7916, "num_input_tokens_seen": 21671888, "step": 37580 }, { "epoch": 5.5980041703902295, "grad_norm": 0.1654967963695526, "learning_rate": 2.4146218091099e-05, "loss": 0.824, "num_input_tokens_seen": 21674832, "step": 37585 }, { "epoch": 5.598748882931188, "grad_norm": 0.27679142355918884, "learning_rate": 2.413972306804675e-05, "loss": 0.7968, "num_input_tokens_seen": 21677456, "step": 37590 }, { "epoch": 5.599493595472148, "grad_norm": 0.22798103094100952, "learning_rate": 2.4133228103128526e-05, "loss": 0.8089, "num_input_tokens_seen": 21680400, "step": 37595 }, { "epoch": 5.600238308013107, "grad_norm": 0.1806035190820694, "learning_rate": 2.4126733196783214e-05, "loss": 0.7943, "num_input_tokens_seen": 21683344, "step": 37600 }, { "epoch": 5.600983020554066, "grad_norm": 0.20274783670902252, "learning_rate": 2.4120238349449728e-05, "loss": 0.8352, "num_input_tokens_seen": 21686192, "step": 37605 }, { "epoch": 5.601727733095025, "grad_norm": 0.2559971511363983, "learning_rate": 2.411374356156695e-05, "loss": 0.8109, "num_input_tokens_seen": 21689008, "step": 37610 }, { "epoch": 5.602472445635985, "grad_norm": 0.2186306267976761, "learning_rate": 2.410724883357378e-05, "loss": 0.7772, "num_input_tokens_seen": 21691600, "step": 37615 }, { "epoch": 5.603217158176943, "grad_norm": 0.19187214970588684, "learning_rate": 2.4100754165909108e-05, "loss": 0.815, "num_input_tokens_seen": 21694288, "step": 37620 }, { "epoch": 5.603961870717903, "grad_norm": 0.1845647692680359, "learning_rate": 2.4094259559011813e-05, "loss": 0.797, "num_input_tokens_seen": 21697232, "step": 37625 }, { "epoch": 5.604706583258862, "grad_norm": 0.24818895757198334, "learning_rate": 2.4087765013320776e-05, "loss": 0.8138, "num_input_tokens_seen": 21699984, "step": 37630 }, { "epoch": 5.6054512957998215, "grad_norm": 0.22336556017398834, "learning_rate": 2.408127052927487e-05, "loss": 0.8278, "num_input_tokens_seen": 21702832, "step": 37635 }, { "epoch": 5.60619600834078, "grad_norm": 0.17713408172130585, "learning_rate": 2.407477610731297e-05, "loss": 0.8038, "num_input_tokens_seen": 21705680, "step": 37640 }, { "epoch": 5.60694072088174, "grad_norm": 0.33941924571990967, "learning_rate": 2.4068281747873927e-05, "loss": 0.8346, "num_input_tokens_seen": 21708624, "step": 37645 }, { "epoch": 5.607685433422699, "grad_norm": 0.22156217694282532, "learning_rate": 2.4061787451396626e-05, "loss": 0.7896, "num_input_tokens_seen": 21711344, "step": 37650 }, { "epoch": 5.608430145963658, "grad_norm": 0.2908930480480194, "learning_rate": 2.4055293218319907e-05, "loss": 0.7964, "num_input_tokens_seen": 21714672, "step": 37655 }, { "epoch": 5.609174858504617, "grad_norm": 0.18155331909656525, "learning_rate": 2.4048799049082632e-05, "loss": 0.8041, "num_input_tokens_seen": 21717616, "step": 37660 }, { "epoch": 5.609919571045577, "grad_norm": 0.2036474347114563, "learning_rate": 2.4042304944123654e-05, "loss": 0.7944, "num_input_tokens_seen": 21720464, "step": 37665 }, { "epoch": 5.6106642835865355, "grad_norm": 0.18760111927986145, "learning_rate": 2.4035810903881813e-05, "loss": 0.768, "num_input_tokens_seen": 21723312, "step": 37670 }, { "epoch": 5.611408996127495, "grad_norm": 0.22085824608802795, "learning_rate": 2.4029316928795958e-05, "loss": 0.8129, "num_input_tokens_seen": 21726384, "step": 37675 }, { "epoch": 5.612153708668454, "grad_norm": 0.14021745324134827, "learning_rate": 2.402282301930491e-05, "loss": 0.7947, "num_input_tokens_seen": 21729296, "step": 37680 }, { "epoch": 5.6128984212094135, "grad_norm": 0.20865730941295624, "learning_rate": 2.4016329175847514e-05, "loss": 0.7731, "num_input_tokens_seen": 21732208, "step": 37685 }, { "epoch": 5.613643133750372, "grad_norm": 0.18909543752670288, "learning_rate": 2.4009835398862588e-05, "loss": 0.8188, "num_input_tokens_seen": 21734960, "step": 37690 }, { "epoch": 5.614387846291332, "grad_norm": 0.2104504406452179, "learning_rate": 2.4003341688788958e-05, "loss": 0.7923, "num_input_tokens_seen": 21737744, "step": 37695 }, { "epoch": 5.615132558832291, "grad_norm": 0.20489738881587982, "learning_rate": 2.399684804606545e-05, "loss": 0.8089, "num_input_tokens_seen": 21740592, "step": 37700 }, { "epoch": 5.61587727137325, "grad_norm": 0.17720822989940643, "learning_rate": 2.3990354471130873e-05, "loss": 0.7958, "num_input_tokens_seen": 21743440, "step": 37705 }, { "epoch": 5.616621983914209, "grad_norm": 0.21925075352191925, "learning_rate": 2.398386096442403e-05, "loss": 0.7949, "num_input_tokens_seen": 21746128, "step": 37710 }, { "epoch": 5.617366696455168, "grad_norm": 0.19560573995113373, "learning_rate": 2.3977367526383744e-05, "loss": 0.7771, "num_input_tokens_seen": 21749136, "step": 37715 }, { "epoch": 5.6181114089961275, "grad_norm": 0.16496965289115906, "learning_rate": 2.39708741574488e-05, "loss": 0.7902, "num_input_tokens_seen": 21751920, "step": 37720 }, { "epoch": 5.618856121537087, "grad_norm": 0.20827816426753998, "learning_rate": 2.3964380858057985e-05, "loss": 0.7654, "num_input_tokens_seen": 21754960, "step": 37725 }, { "epoch": 5.619600834078046, "grad_norm": 0.19035519659519196, "learning_rate": 2.3957887628650104e-05, "loss": 0.7798, "num_input_tokens_seen": 21757712, "step": 37730 }, { "epoch": 5.620345546619005, "grad_norm": 0.20223701000213623, "learning_rate": 2.3951394469663946e-05, "loss": 0.7768, "num_input_tokens_seen": 21760976, "step": 37735 }, { "epoch": 5.621090259159964, "grad_norm": 0.2336772382259369, "learning_rate": 2.394490138153828e-05, "loss": 0.8203, "num_input_tokens_seen": 21763952, "step": 37740 }, { "epoch": 5.621834971700923, "grad_norm": 0.142158642411232, "learning_rate": 2.393840836471189e-05, "loss": 0.7988, "num_input_tokens_seen": 21766928, "step": 37745 }, { "epoch": 5.622579684241883, "grad_norm": 0.2016066312789917, "learning_rate": 2.3931915419623552e-05, "loss": 0.7759, "num_input_tokens_seen": 21769904, "step": 37750 }, { "epoch": 5.6233243967828415, "grad_norm": 0.20767949521541595, "learning_rate": 2.3925422546712032e-05, "loss": 0.7746, "num_input_tokens_seen": 21772720, "step": 37755 }, { "epoch": 5.624069109323801, "grad_norm": 0.1756974160671234, "learning_rate": 2.3918929746416077e-05, "loss": 0.8052, "num_input_tokens_seen": 21775568, "step": 37760 }, { "epoch": 5.62481382186476, "grad_norm": 0.2347211241722107, "learning_rate": 2.3912437019174454e-05, "loss": 0.8061, "num_input_tokens_seen": 21778512, "step": 37765 }, { "epoch": 5.6255585344057195, "grad_norm": 0.19423963129520416, "learning_rate": 2.3905944365425922e-05, "loss": 0.8254, "num_input_tokens_seen": 21781360, "step": 37770 }, { "epoch": 5.626303246946678, "grad_norm": 0.21700561046600342, "learning_rate": 2.3899451785609218e-05, "loss": 0.8244, "num_input_tokens_seen": 21784464, "step": 37775 }, { "epoch": 5.627047959487638, "grad_norm": 0.16319233179092407, "learning_rate": 2.3892959280163084e-05, "loss": 0.8014, "num_input_tokens_seen": 21787312, "step": 37780 }, { "epoch": 5.627792672028597, "grad_norm": 0.24520978331565857, "learning_rate": 2.388646684952627e-05, "loss": 0.8409, "num_input_tokens_seen": 21790256, "step": 37785 }, { "epoch": 5.628537384569556, "grad_norm": 0.14184483885765076, "learning_rate": 2.3879974494137487e-05, "loss": 0.8223, "num_input_tokens_seen": 21793008, "step": 37790 }, { "epoch": 5.629282097110515, "grad_norm": 0.2109685093164444, "learning_rate": 2.3873482214435486e-05, "loss": 0.8048, "num_input_tokens_seen": 21795984, "step": 37795 }, { "epoch": 5.630026809651475, "grad_norm": 0.18225209414958954, "learning_rate": 2.3866990010858976e-05, "loss": 0.7688, "num_input_tokens_seen": 21798960, "step": 37800 }, { "epoch": 5.6307715221924335, "grad_norm": 0.21717001497745514, "learning_rate": 2.386049788384667e-05, "loss": 0.7856, "num_input_tokens_seen": 21801744, "step": 37805 }, { "epoch": 5.631516234733393, "grad_norm": 0.24012663960456848, "learning_rate": 2.3854005833837285e-05, "loss": 0.7998, "num_input_tokens_seen": 21804560, "step": 37810 }, { "epoch": 5.632260947274352, "grad_norm": 0.18158510327339172, "learning_rate": 2.384751386126953e-05, "loss": 0.7513, "num_input_tokens_seen": 21807344, "step": 37815 }, { "epoch": 5.6330056598153115, "grad_norm": 0.24334651231765747, "learning_rate": 2.3841021966582095e-05, "loss": 0.7782, "num_input_tokens_seen": 21810032, "step": 37820 }, { "epoch": 5.63375037235627, "grad_norm": 0.3058164119720459, "learning_rate": 2.3834530150213686e-05, "loss": 0.8204, "num_input_tokens_seen": 21812816, "step": 37825 }, { "epoch": 5.63449508489723, "grad_norm": 0.19511273503303528, "learning_rate": 2.3828038412602993e-05, "loss": 0.7814, "num_input_tokens_seen": 21815312, "step": 37830 }, { "epoch": 5.635239797438189, "grad_norm": 0.19666409492492676, "learning_rate": 2.3821546754188698e-05, "loss": 0.8074, "num_input_tokens_seen": 21818256, "step": 37835 }, { "epoch": 5.635984509979148, "grad_norm": 0.1936599314212799, "learning_rate": 2.381505517540949e-05, "loss": 0.8149, "num_input_tokens_seen": 21820976, "step": 37840 }, { "epoch": 5.636729222520107, "grad_norm": 0.2020179182291031, "learning_rate": 2.3808563676704027e-05, "loss": 0.8143, "num_input_tokens_seen": 21823984, "step": 37845 }, { "epoch": 5.637473935061067, "grad_norm": 0.24760043621063232, "learning_rate": 2.3802072258510986e-05, "loss": 0.8076, "num_input_tokens_seen": 21827120, "step": 37850 }, { "epoch": 5.6382186476020255, "grad_norm": 0.29986149072647095, "learning_rate": 2.3795580921269034e-05, "loss": 0.8271, "num_input_tokens_seen": 21830000, "step": 37855 }, { "epoch": 5.638963360142985, "grad_norm": 0.2766496241092682, "learning_rate": 2.378908966541682e-05, "loss": 0.8005, "num_input_tokens_seen": 21832880, "step": 37860 }, { "epoch": 5.639708072683944, "grad_norm": 0.21945755183696747, "learning_rate": 2.3782598491393014e-05, "loss": 0.8015, "num_input_tokens_seen": 21835792, "step": 37865 }, { "epoch": 5.640452785224904, "grad_norm": 0.24193502962589264, "learning_rate": 2.3776107399636247e-05, "loss": 0.8078, "num_input_tokens_seen": 21838608, "step": 37870 }, { "epoch": 5.641197497765862, "grad_norm": 0.19619597494602203, "learning_rate": 2.376961639058516e-05, "loss": 0.81, "num_input_tokens_seen": 21841264, "step": 37875 }, { "epoch": 5.641942210306821, "grad_norm": 0.21362587809562683, "learning_rate": 2.3763125464678414e-05, "loss": 0.7798, "num_input_tokens_seen": 21844336, "step": 37880 }, { "epoch": 5.642686922847781, "grad_norm": 0.17149190604686737, "learning_rate": 2.3756634622354607e-05, "loss": 0.7876, "num_input_tokens_seen": 21847056, "step": 37885 }, { "epoch": 5.64343163538874, "grad_norm": 0.1823127418756485, "learning_rate": 2.3750143864052376e-05, "loss": 0.7892, "num_input_tokens_seen": 21849680, "step": 37890 }, { "epoch": 5.644176347929699, "grad_norm": 0.24530752003192902, "learning_rate": 2.374365319021034e-05, "loss": 0.7846, "num_input_tokens_seen": 21852432, "step": 37895 }, { "epoch": 5.644921060470658, "grad_norm": 0.24414227902889252, "learning_rate": 2.373716260126712e-05, "loss": 0.8271, "num_input_tokens_seen": 21855184, "step": 37900 }, { "epoch": 5.6456657730116175, "grad_norm": 0.2539825737476349, "learning_rate": 2.373067209766131e-05, "loss": 0.7765, "num_input_tokens_seen": 21857936, "step": 37905 }, { "epoch": 5.646410485552577, "grad_norm": 0.23490193486213684, "learning_rate": 2.372418167983152e-05, "loss": 0.8037, "num_input_tokens_seen": 21860752, "step": 37910 }, { "epoch": 5.647155198093536, "grad_norm": 0.21060612797737122, "learning_rate": 2.371769134821635e-05, "loss": 0.8333, "num_input_tokens_seen": 21863728, "step": 37915 }, { "epoch": 5.647899910634495, "grad_norm": 0.2066771537065506, "learning_rate": 2.371120110325439e-05, "loss": 0.7898, "num_input_tokens_seen": 21866768, "step": 37920 }, { "epoch": 5.648644623175454, "grad_norm": 0.1943744421005249, "learning_rate": 2.370471094538421e-05, "loss": 0.8149, "num_input_tokens_seen": 21869904, "step": 37925 }, { "epoch": 5.649389335716413, "grad_norm": 0.2490803301334381, "learning_rate": 2.3698220875044396e-05, "loss": 0.8017, "num_input_tokens_seen": 21872784, "step": 37930 }, { "epoch": 5.650134048257373, "grad_norm": 0.20945067703723907, "learning_rate": 2.369173089267353e-05, "loss": 0.7675, "num_input_tokens_seen": 21875472, "step": 37935 }, { "epoch": 5.6508787607983315, "grad_norm": 0.25726139545440674, "learning_rate": 2.3685240998710166e-05, "loss": 0.7387, "num_input_tokens_seen": 21878512, "step": 37940 }, { "epoch": 5.651623473339291, "grad_norm": 0.23404541611671448, "learning_rate": 2.367875119359287e-05, "loss": 0.8187, "num_input_tokens_seen": 21881264, "step": 37945 }, { "epoch": 5.65236818588025, "grad_norm": 0.23071885108947754, "learning_rate": 2.36722614777602e-05, "loss": 0.8044, "num_input_tokens_seen": 21883760, "step": 37950 }, { "epoch": 5.65311289842121, "grad_norm": 0.1863296777009964, "learning_rate": 2.3665771851650697e-05, "loss": 0.8305, "num_input_tokens_seen": 21886480, "step": 37955 }, { "epoch": 5.653857610962168, "grad_norm": 0.22292160987854004, "learning_rate": 2.3659282315702918e-05, "loss": 0.7893, "num_input_tokens_seen": 21889456, "step": 37960 }, { "epoch": 5.654602323503128, "grad_norm": 0.2169644832611084, "learning_rate": 2.365279287035538e-05, "loss": 0.7865, "num_input_tokens_seen": 21892080, "step": 37965 }, { "epoch": 5.655347036044087, "grad_norm": 0.2539254128932953, "learning_rate": 2.3646303516046626e-05, "loss": 0.8048, "num_input_tokens_seen": 21895152, "step": 37970 }, { "epoch": 5.656091748585046, "grad_norm": 0.22443759441375732, "learning_rate": 2.363981425321517e-05, "loss": 0.8467, "num_input_tokens_seen": 21898320, "step": 37975 }, { "epoch": 5.656836461126005, "grad_norm": 0.2999298870563507, "learning_rate": 2.3633325082299545e-05, "loss": 0.8229, "num_input_tokens_seen": 21901040, "step": 37980 }, { "epoch": 5.657581173666965, "grad_norm": 0.23347172141075134, "learning_rate": 2.362683600373825e-05, "loss": 0.8202, "num_input_tokens_seen": 21903664, "step": 37985 }, { "epoch": 5.6583258862079235, "grad_norm": 0.31937000155448914, "learning_rate": 2.362034701796979e-05, "loss": 0.8516, "num_input_tokens_seen": 21906416, "step": 37990 }, { "epoch": 5.659070598748883, "grad_norm": 0.21967440843582153, "learning_rate": 2.3613858125432677e-05, "loss": 0.8316, "num_input_tokens_seen": 21908880, "step": 37995 }, { "epoch": 5.659815311289842, "grad_norm": 0.2680971920490265, "learning_rate": 2.3607369326565403e-05, "loss": 0.8144, "num_input_tokens_seen": 21911728, "step": 38000 }, { "epoch": 5.660560023830802, "grad_norm": 0.2545938491821289, "learning_rate": 2.3600880621806438e-05, "loss": 0.7858, "num_input_tokens_seen": 21914736, "step": 38005 }, { "epoch": 5.66130473637176, "grad_norm": 0.19290421903133392, "learning_rate": 2.359439201159427e-05, "loss": 0.7806, "num_input_tokens_seen": 21917424, "step": 38010 }, { "epoch": 5.66204944891272, "grad_norm": 0.22965562343597412, "learning_rate": 2.3587903496367382e-05, "loss": 0.7881, "num_input_tokens_seen": 21920432, "step": 38015 }, { "epoch": 5.662794161453679, "grad_norm": 0.19279609620571136, "learning_rate": 2.3581415076564225e-05, "loss": 0.8361, "num_input_tokens_seen": 21923600, "step": 38020 }, { "epoch": 5.663538873994638, "grad_norm": 0.23676900565624237, "learning_rate": 2.3574926752623276e-05, "loss": 0.7917, "num_input_tokens_seen": 21926480, "step": 38025 }, { "epoch": 5.664283586535597, "grad_norm": 0.2466031163930893, "learning_rate": 2.3568438524982984e-05, "loss": 0.8322, "num_input_tokens_seen": 21929616, "step": 38030 }, { "epoch": 5.665028299076557, "grad_norm": 0.19854843616485596, "learning_rate": 2.3561950394081793e-05, "loss": 0.8167, "num_input_tokens_seen": 21932400, "step": 38035 }, { "epoch": 5.665773011617516, "grad_norm": 0.22540892660617828, "learning_rate": 2.3555462360358154e-05, "loss": 0.8224, "num_input_tokens_seen": 21935408, "step": 38040 }, { "epoch": 5.666517724158475, "grad_norm": 0.16786104440689087, "learning_rate": 2.3548974424250492e-05, "loss": 0.8017, "num_input_tokens_seen": 21938096, "step": 38045 }, { "epoch": 5.667262436699434, "grad_norm": 0.27016499638557434, "learning_rate": 2.3542486586197237e-05, "loss": 0.786, "num_input_tokens_seen": 21941040, "step": 38050 }, { "epoch": 5.668007149240394, "grad_norm": 0.22263164818286896, "learning_rate": 2.3535998846636815e-05, "loss": 0.8021, "num_input_tokens_seen": 21943472, "step": 38055 }, { "epoch": 5.668751861781352, "grad_norm": 0.23870162665843964, "learning_rate": 2.352951120600763e-05, "loss": 0.8051, "num_input_tokens_seen": 21946352, "step": 38060 }, { "epoch": 5.669496574322311, "grad_norm": 0.36915528774261475, "learning_rate": 2.352302366474811e-05, "loss": 0.7655, "num_input_tokens_seen": 21948976, "step": 38065 }, { "epoch": 5.670241286863271, "grad_norm": 0.1711731255054474, "learning_rate": 2.351653622329664e-05, "loss": 0.8126, "num_input_tokens_seen": 21951824, "step": 38070 }, { "epoch": 5.67098599940423, "grad_norm": 0.21556688845157623, "learning_rate": 2.351004888209162e-05, "loss": 0.8162, "num_input_tokens_seen": 21954736, "step": 38075 }, { "epoch": 5.671730711945189, "grad_norm": 0.25982773303985596, "learning_rate": 2.3503561641571455e-05, "loss": 0.7988, "num_input_tokens_seen": 21957360, "step": 38080 }, { "epoch": 5.672475424486148, "grad_norm": 0.21199001371860504, "learning_rate": 2.3497074502174495e-05, "loss": 0.7778, "num_input_tokens_seen": 21959984, "step": 38085 }, { "epoch": 5.673220137027108, "grad_norm": 0.21235670149326324, "learning_rate": 2.349058746433913e-05, "loss": 0.8083, "num_input_tokens_seen": 21962768, "step": 38090 }, { "epoch": 5.673964849568066, "grad_norm": 0.22619280219078064, "learning_rate": 2.348410052850373e-05, "loss": 0.8009, "num_input_tokens_seen": 21965808, "step": 38095 }, { "epoch": 5.674709562109026, "grad_norm": 0.20970873534679413, "learning_rate": 2.347761369510665e-05, "loss": 0.7939, "num_input_tokens_seen": 21968688, "step": 38100 }, { "epoch": 5.675454274649985, "grad_norm": 0.2001921534538269, "learning_rate": 2.3471126964586247e-05, "loss": 0.7812, "num_input_tokens_seen": 21971664, "step": 38105 }, { "epoch": 5.676198987190944, "grad_norm": 0.32447826862335205, "learning_rate": 2.3464640337380868e-05, "loss": 0.8127, "num_input_tokens_seen": 21974800, "step": 38110 }, { "epoch": 5.676943699731903, "grad_norm": 0.22924484312534332, "learning_rate": 2.3458153813928857e-05, "loss": 0.7925, "num_input_tokens_seen": 21977584, "step": 38115 }, { "epoch": 5.677688412272863, "grad_norm": 0.16382066905498505, "learning_rate": 2.345166739466855e-05, "loss": 0.7958, "num_input_tokens_seen": 21980656, "step": 38120 }, { "epoch": 5.678433124813822, "grad_norm": 0.3624574840068817, "learning_rate": 2.344518108003825e-05, "loss": 0.82, "num_input_tokens_seen": 21983888, "step": 38125 }, { "epoch": 5.679177837354781, "grad_norm": 0.17953231930732727, "learning_rate": 2.3438694870476295e-05, "loss": 0.798, "num_input_tokens_seen": 21986800, "step": 38130 }, { "epoch": 5.67992254989574, "grad_norm": 0.2008824646472931, "learning_rate": 2.3432208766421e-05, "loss": 0.8138, "num_input_tokens_seen": 21989680, "step": 38135 }, { "epoch": 5.6806672624367, "grad_norm": 0.20705097913742065, "learning_rate": 2.3425722768310652e-05, "loss": 0.7856, "num_input_tokens_seen": 21992560, "step": 38140 }, { "epoch": 5.681411974977658, "grad_norm": 0.19737765192985535, "learning_rate": 2.3419236876583568e-05, "loss": 0.8045, "num_input_tokens_seen": 21995440, "step": 38145 }, { "epoch": 5.682156687518618, "grad_norm": 0.21166351437568665, "learning_rate": 2.341275109167802e-05, "loss": 0.8201, "num_input_tokens_seen": 21998416, "step": 38150 }, { "epoch": 5.682901400059577, "grad_norm": 0.2787820100784302, "learning_rate": 2.34062654140323e-05, "loss": 0.7964, "num_input_tokens_seen": 22001424, "step": 38155 }, { "epoch": 5.683646112600536, "grad_norm": 0.2166125327348709, "learning_rate": 2.33997798440847e-05, "loss": 0.8155, "num_input_tokens_seen": 22004336, "step": 38160 }, { "epoch": 5.684390825141495, "grad_norm": 0.22090214490890503, "learning_rate": 2.3393294382273462e-05, "loss": 0.8077, "num_input_tokens_seen": 22007024, "step": 38165 }, { "epoch": 5.685135537682455, "grad_norm": 0.2412284016609192, "learning_rate": 2.338680902903685e-05, "loss": 0.7823, "num_input_tokens_seen": 22010032, "step": 38170 }, { "epoch": 5.685880250223414, "grad_norm": 0.18516677618026733, "learning_rate": 2.338032378481313e-05, "loss": 0.8205, "num_input_tokens_seen": 22013040, "step": 38175 }, { "epoch": 5.686624962764373, "grad_norm": 0.15866248309612274, "learning_rate": 2.3373838650040548e-05, "loss": 0.7749, "num_input_tokens_seen": 22016272, "step": 38180 }, { "epoch": 5.687369675305332, "grad_norm": 0.21272695064544678, "learning_rate": 2.3367353625157333e-05, "loss": 0.7846, "num_input_tokens_seen": 22019056, "step": 38185 }, { "epoch": 5.688114387846292, "grad_norm": 0.21347922086715698, "learning_rate": 2.3360868710601717e-05, "loss": 0.7977, "num_input_tokens_seen": 22021936, "step": 38190 }, { "epoch": 5.68885910038725, "grad_norm": 0.24751102924346924, "learning_rate": 2.335438390681194e-05, "loss": 0.7871, "num_input_tokens_seen": 22025072, "step": 38195 }, { "epoch": 5.68960381292821, "grad_norm": 0.16890603303909302, "learning_rate": 2.3347899214226214e-05, "loss": 0.794, "num_input_tokens_seen": 22027760, "step": 38200 }, { "epoch": 5.690348525469169, "grad_norm": 0.22291529178619385, "learning_rate": 2.334141463328273e-05, "loss": 0.838, "num_input_tokens_seen": 22030512, "step": 38205 }, { "epoch": 5.6910932380101285, "grad_norm": 0.20927904546260834, "learning_rate": 2.33349301644197e-05, "loss": 0.798, "num_input_tokens_seen": 22033616, "step": 38210 }, { "epoch": 5.691837950551087, "grad_norm": 0.17195433378219604, "learning_rate": 2.332844580807533e-05, "loss": 0.8329, "num_input_tokens_seen": 22036400, "step": 38215 }, { "epoch": 5.692582663092047, "grad_norm": 0.20645596086978912, "learning_rate": 2.3321961564687787e-05, "loss": 0.8063, "num_input_tokens_seen": 22038864, "step": 38220 }, { "epoch": 5.693327375633006, "grad_norm": 0.2983483672142029, "learning_rate": 2.3315477434695256e-05, "loss": 0.8336, "num_input_tokens_seen": 22041744, "step": 38225 }, { "epoch": 5.694072088173964, "grad_norm": 0.28790587186813354, "learning_rate": 2.3308993418535924e-05, "loss": 0.7803, "num_input_tokens_seen": 22044784, "step": 38230 }, { "epoch": 5.694816800714924, "grad_norm": 0.1948166787624359, "learning_rate": 2.330250951664793e-05, "loss": 0.8017, "num_input_tokens_seen": 22047536, "step": 38235 }, { "epoch": 5.695561513255884, "grad_norm": 0.2121284455060959, "learning_rate": 2.3296025729469457e-05, "loss": 0.8205, "num_input_tokens_seen": 22050640, "step": 38240 }, { "epoch": 5.696306225796842, "grad_norm": 0.2936781942844391, "learning_rate": 2.3289542057438625e-05, "loss": 0.7917, "num_input_tokens_seen": 22053328, "step": 38245 }, { "epoch": 5.697050938337801, "grad_norm": 0.26957204937934875, "learning_rate": 2.3283058500993587e-05, "loss": 0.786, "num_input_tokens_seen": 22056368, "step": 38250 }, { "epoch": 5.697795650878761, "grad_norm": 0.2242647260427475, "learning_rate": 2.3276575060572476e-05, "loss": 0.8006, "num_input_tokens_seen": 22059472, "step": 38255 }, { "epoch": 5.6985403634197205, "grad_norm": 0.16112396121025085, "learning_rate": 2.3270091736613412e-05, "loss": 0.8173, "num_input_tokens_seen": 22062192, "step": 38260 }, { "epoch": 5.699285075960679, "grad_norm": 0.17312677204608917, "learning_rate": 2.326360852955452e-05, "loss": 0.8011, "num_input_tokens_seen": 22064976, "step": 38265 }, { "epoch": 5.700029788501638, "grad_norm": 0.26946020126342773, "learning_rate": 2.3257125439833902e-05, "loss": 0.8001, "num_input_tokens_seen": 22068208, "step": 38270 }, { "epoch": 5.700774501042598, "grad_norm": 0.23349635303020477, "learning_rate": 2.325064246788966e-05, "loss": 0.7684, "num_input_tokens_seen": 22071344, "step": 38275 }, { "epoch": 5.701519213583556, "grad_norm": 0.17876197397708893, "learning_rate": 2.3244159614159898e-05, "loss": 0.8079, "num_input_tokens_seen": 22074192, "step": 38280 }, { "epoch": 5.702263926124516, "grad_norm": 0.2593783140182495, "learning_rate": 2.3237676879082682e-05, "loss": 0.8233, "num_input_tokens_seen": 22077168, "step": 38285 }, { "epoch": 5.703008638665475, "grad_norm": 0.2475227564573288, "learning_rate": 2.3231194263096096e-05, "loss": 0.7857, "num_input_tokens_seen": 22079664, "step": 38290 }, { "epoch": 5.7037533512064345, "grad_norm": 0.2587258815765381, "learning_rate": 2.322471176663821e-05, "loss": 0.8201, "num_input_tokens_seen": 22082640, "step": 38295 }, { "epoch": 5.704498063747393, "grad_norm": 0.29461899399757385, "learning_rate": 2.3218229390147086e-05, "loss": 0.7948, "num_input_tokens_seen": 22085328, "step": 38300 }, { "epoch": 5.705242776288353, "grad_norm": 0.19079989194869995, "learning_rate": 2.3211747134060774e-05, "loss": 0.7781, "num_input_tokens_seen": 22088208, "step": 38305 }, { "epoch": 5.705987488829312, "grad_norm": 0.2260117381811142, "learning_rate": 2.3205264998817326e-05, "loss": 0.7862, "num_input_tokens_seen": 22090992, "step": 38310 }, { "epoch": 5.706732201370271, "grad_norm": 0.19701793789863586, "learning_rate": 2.3198782984854765e-05, "loss": 0.7853, "num_input_tokens_seen": 22093904, "step": 38315 }, { "epoch": 5.70747691391123, "grad_norm": 0.3848586678504944, "learning_rate": 2.3192301092611138e-05, "loss": 0.8148, "num_input_tokens_seen": 22097008, "step": 38320 }, { "epoch": 5.70822162645219, "grad_norm": 0.20935316383838654, "learning_rate": 2.3185819322524443e-05, "loss": 0.7955, "num_input_tokens_seen": 22100080, "step": 38325 }, { "epoch": 5.708966338993148, "grad_norm": 0.18801334500312805, "learning_rate": 2.3179337675032707e-05, "loss": 0.7869, "num_input_tokens_seen": 22102672, "step": 38330 }, { "epoch": 5.709711051534108, "grad_norm": 0.17765803635120392, "learning_rate": 2.3172856150573926e-05, "loss": 0.7741, "num_input_tokens_seen": 22105616, "step": 38335 }, { "epoch": 5.710455764075067, "grad_norm": 0.2266005128622055, "learning_rate": 2.3166374749586094e-05, "loss": 0.8221, "num_input_tokens_seen": 22108752, "step": 38340 }, { "epoch": 5.7112004766160265, "grad_norm": 0.24778568744659424, "learning_rate": 2.3159893472507212e-05, "loss": 0.7887, "num_input_tokens_seen": 22111600, "step": 38345 }, { "epoch": 5.711945189156985, "grad_norm": 0.15379957854747772, "learning_rate": 2.315341231977524e-05, "loss": 0.7978, "num_input_tokens_seen": 22114384, "step": 38350 }, { "epoch": 5.712689901697945, "grad_norm": 0.2848481237888336, "learning_rate": 2.314693129182815e-05, "loss": 0.8058, "num_input_tokens_seen": 22116944, "step": 38355 }, { "epoch": 5.713434614238904, "grad_norm": 0.19663161039352417, "learning_rate": 2.314045038910393e-05, "loss": 0.8186, "num_input_tokens_seen": 22119760, "step": 38360 }, { "epoch": 5.714179326779863, "grad_norm": 0.2255353033542633, "learning_rate": 2.3133969612040503e-05, "loss": 0.8218, "num_input_tokens_seen": 22122928, "step": 38365 }, { "epoch": 5.714924039320822, "grad_norm": 0.20310546457767487, "learning_rate": 2.3127488961075812e-05, "loss": 0.7959, "num_input_tokens_seen": 22125616, "step": 38370 }, { "epoch": 5.715668751861782, "grad_norm": 0.403033584356308, "learning_rate": 2.312100843664781e-05, "loss": 0.8504, "num_input_tokens_seen": 22128528, "step": 38375 }, { "epoch": 5.7164134644027405, "grad_norm": 0.18438173830509186, "learning_rate": 2.311452803919442e-05, "loss": 0.8043, "num_input_tokens_seen": 22131664, "step": 38380 }, { "epoch": 5.7171581769437, "grad_norm": 0.2215699553489685, "learning_rate": 2.3108047769153558e-05, "loss": 0.8143, "num_input_tokens_seen": 22134736, "step": 38385 }, { "epoch": 5.717902889484659, "grad_norm": 0.21606701612472534, "learning_rate": 2.3101567626963138e-05, "loss": 0.8064, "num_input_tokens_seen": 22137552, "step": 38390 }, { "epoch": 5.718647602025618, "grad_norm": 0.20157743990421295, "learning_rate": 2.3095087613061058e-05, "loss": 0.7869, "num_input_tokens_seen": 22140400, "step": 38395 }, { "epoch": 5.719392314566577, "grad_norm": 0.257904976606369, "learning_rate": 2.3088607727885207e-05, "loss": 0.7805, "num_input_tokens_seen": 22143248, "step": 38400 }, { "epoch": 5.720137027107537, "grad_norm": 0.1676006317138672, "learning_rate": 2.3082127971873492e-05, "loss": 0.8002, "num_input_tokens_seen": 22146160, "step": 38405 }, { "epoch": 5.720881739648496, "grad_norm": 0.2062356024980545, "learning_rate": 2.3075648345463754e-05, "loss": 0.8088, "num_input_tokens_seen": 22148720, "step": 38410 }, { "epoch": 5.721626452189454, "grad_norm": 0.18536926805973053, "learning_rate": 2.3069168849093885e-05, "loss": 0.7945, "num_input_tokens_seen": 22151504, "step": 38415 }, { "epoch": 5.722371164730414, "grad_norm": 0.18414196372032166, "learning_rate": 2.306268948320173e-05, "loss": 0.8233, "num_input_tokens_seen": 22154224, "step": 38420 }, { "epoch": 5.723115877271374, "grad_norm": 0.20015345513820648, "learning_rate": 2.305621024822514e-05, "loss": 0.8135, "num_input_tokens_seen": 22157072, "step": 38425 }, { "epoch": 5.7238605898123325, "grad_norm": 0.16021384298801422, "learning_rate": 2.3049731144601967e-05, "loss": 0.8147, "num_input_tokens_seen": 22159568, "step": 38430 }, { "epoch": 5.724605302353291, "grad_norm": 0.19981306791305542, "learning_rate": 2.3043252172770027e-05, "loss": 0.7955, "num_input_tokens_seen": 22162512, "step": 38435 }, { "epoch": 5.725350014894251, "grad_norm": 0.19520217180252075, "learning_rate": 2.303677333316715e-05, "loss": 0.8246, "num_input_tokens_seen": 22165840, "step": 38440 }, { "epoch": 5.72609472743521, "grad_norm": 0.1774706244468689, "learning_rate": 2.3030294626231162e-05, "loss": 0.7928, "num_input_tokens_seen": 22168976, "step": 38445 }, { "epoch": 5.726839439976169, "grad_norm": 0.18805480003356934, "learning_rate": 2.302381605239985e-05, "loss": 0.7847, "num_input_tokens_seen": 22171760, "step": 38450 }, { "epoch": 5.727584152517128, "grad_norm": 0.1893829107284546, "learning_rate": 2.3017337612111007e-05, "loss": 0.7973, "num_input_tokens_seen": 22174736, "step": 38455 }, { "epoch": 5.728328865058088, "grad_norm": 0.2845372259616852, "learning_rate": 2.3010859305802426e-05, "loss": 0.8266, "num_input_tokens_seen": 22177680, "step": 38460 }, { "epoch": 5.7290735775990465, "grad_norm": 0.26528069376945496, "learning_rate": 2.300438113391189e-05, "loss": 0.8123, "num_input_tokens_seen": 22180656, "step": 38465 }, { "epoch": 5.729818290140006, "grad_norm": 0.20765095949172974, "learning_rate": 2.2997903096877164e-05, "loss": 0.8011, "num_input_tokens_seen": 22183600, "step": 38470 }, { "epoch": 5.730563002680965, "grad_norm": 0.19649899005889893, "learning_rate": 2.299142519513601e-05, "loss": 0.8181, "num_input_tokens_seen": 22186544, "step": 38475 }, { "epoch": 5.7313077152219245, "grad_norm": 0.16903747618198395, "learning_rate": 2.298494742912617e-05, "loss": 0.7985, "num_input_tokens_seen": 22189136, "step": 38480 }, { "epoch": 5.732052427762883, "grad_norm": 0.2026386708021164, "learning_rate": 2.2978469799285397e-05, "loss": 0.804, "num_input_tokens_seen": 22192208, "step": 38485 }, { "epoch": 5.732797140303843, "grad_norm": 0.2622200846672058, "learning_rate": 2.297199230605141e-05, "loss": 0.8298, "num_input_tokens_seen": 22195120, "step": 38490 }, { "epoch": 5.733541852844802, "grad_norm": 0.22019633650779724, "learning_rate": 2.2965514949861938e-05, "loss": 0.8132, "num_input_tokens_seen": 22197808, "step": 38495 }, { "epoch": 5.734286565385761, "grad_norm": 0.2756539583206177, "learning_rate": 2.2959037731154692e-05, "loss": 0.8179, "num_input_tokens_seen": 22200848, "step": 38500 }, { "epoch": 5.73503127792672, "grad_norm": 0.25571832060813904, "learning_rate": 2.295256065036738e-05, "loss": 0.7899, "num_input_tokens_seen": 22203408, "step": 38505 }, { "epoch": 5.73577599046768, "grad_norm": 0.2197304517030716, "learning_rate": 2.2946083707937697e-05, "loss": 0.7861, "num_input_tokens_seen": 22206640, "step": 38510 }, { "epoch": 5.7365207030086385, "grad_norm": 0.23509038984775543, "learning_rate": 2.293960690430332e-05, "loss": 0.82, "num_input_tokens_seen": 22209488, "step": 38515 }, { "epoch": 5.737265415549598, "grad_norm": 0.23182162642478943, "learning_rate": 2.2933130239901934e-05, "loss": 0.7812, "num_input_tokens_seen": 22212528, "step": 38520 }, { "epoch": 5.738010128090557, "grad_norm": 0.20687858760356903, "learning_rate": 2.2926653715171215e-05, "loss": 0.8106, "num_input_tokens_seen": 22215632, "step": 38525 }, { "epoch": 5.7387548406315165, "grad_norm": 0.20310814678668976, "learning_rate": 2.2920177330548802e-05, "loss": 0.8178, "num_input_tokens_seen": 22218480, "step": 38530 }, { "epoch": 5.739499553172475, "grad_norm": 0.277693510055542, "learning_rate": 2.2913701086472343e-05, "loss": 0.7878, "num_input_tokens_seen": 22221392, "step": 38535 }, { "epoch": 5.740244265713435, "grad_norm": 0.1783818155527115, "learning_rate": 2.290722498337948e-05, "loss": 0.8079, "num_input_tokens_seen": 22224048, "step": 38540 }, { "epoch": 5.740988978254394, "grad_norm": 0.26804476976394653, "learning_rate": 2.2900749021707855e-05, "loss": 0.7974, "num_input_tokens_seen": 22226768, "step": 38545 }, { "epoch": 5.741733690795353, "grad_norm": 0.1936316043138504, "learning_rate": 2.2894273201895068e-05, "loss": 0.7908, "num_input_tokens_seen": 22229520, "step": 38550 }, { "epoch": 5.742478403336312, "grad_norm": 0.19882604479789734, "learning_rate": 2.2887797524378734e-05, "loss": 0.8125, "num_input_tokens_seen": 22232432, "step": 38555 }, { "epoch": 5.743223115877272, "grad_norm": 0.13247625529766083, "learning_rate": 2.2881321989596464e-05, "loss": 0.8249, "num_input_tokens_seen": 22235312, "step": 38560 }, { "epoch": 5.7439678284182305, "grad_norm": 0.2637694180011749, "learning_rate": 2.2874846597985842e-05, "loss": 0.8106, "num_input_tokens_seen": 22238288, "step": 38565 }, { "epoch": 5.74471254095919, "grad_norm": 0.21139448881149292, "learning_rate": 2.2868371349984442e-05, "loss": 0.8288, "num_input_tokens_seen": 22240816, "step": 38570 }, { "epoch": 5.745457253500149, "grad_norm": 0.16793109476566315, "learning_rate": 2.2861896246029835e-05, "loss": 0.7968, "num_input_tokens_seen": 22243792, "step": 38575 }, { "epoch": 5.746201966041108, "grad_norm": 0.20600147545337677, "learning_rate": 2.2855421286559593e-05, "loss": 0.8218, "num_input_tokens_seen": 22246544, "step": 38580 }, { "epoch": 5.746946678582067, "grad_norm": 0.30604907870292664, "learning_rate": 2.2848946472011258e-05, "loss": 0.8179, "num_input_tokens_seen": 22249456, "step": 38585 }, { "epoch": 5.747691391123027, "grad_norm": 0.1837274581193924, "learning_rate": 2.2842471802822372e-05, "loss": 0.7757, "num_input_tokens_seen": 22252368, "step": 38590 }, { "epoch": 5.748436103663986, "grad_norm": 0.23482836782932281, "learning_rate": 2.2835997279430475e-05, "loss": 0.8279, "num_input_tokens_seen": 22255152, "step": 38595 }, { "epoch": 5.7491808162049445, "grad_norm": 0.19745776057243347, "learning_rate": 2.282952290227308e-05, "loss": 0.773, "num_input_tokens_seen": 22258288, "step": 38600 }, { "epoch": 5.749925528745904, "grad_norm": 0.18454813957214355, "learning_rate": 2.2823048671787715e-05, "loss": 0.7966, "num_input_tokens_seen": 22260976, "step": 38605 }, { "epoch": 5.750670241286863, "grad_norm": 0.1922357678413391, "learning_rate": 2.2816574588411857e-05, "loss": 0.8025, "num_input_tokens_seen": 22263760, "step": 38610 }, { "epoch": 5.7514149538278225, "grad_norm": 0.1751740425825119, "learning_rate": 2.2810100652583016e-05, "loss": 0.8105, "num_input_tokens_seen": 22266672, "step": 38615 }, { "epoch": 5.752159666368781, "grad_norm": 0.25301414728164673, "learning_rate": 2.2803626864738664e-05, "loss": 0.8166, "num_input_tokens_seen": 22269424, "step": 38620 }, { "epoch": 5.752904378909741, "grad_norm": 0.2580324709415436, "learning_rate": 2.279715322531628e-05, "loss": 0.7637, "num_input_tokens_seen": 22272464, "step": 38625 }, { "epoch": 5.7536490914507, "grad_norm": 0.25016143918037415, "learning_rate": 2.2790679734753327e-05, "loss": 0.806, "num_input_tokens_seen": 22275088, "step": 38630 }, { "epoch": 5.754393803991659, "grad_norm": 0.26278355717658997, "learning_rate": 2.2784206393487256e-05, "loss": 0.7765, "num_input_tokens_seen": 22277680, "step": 38635 }, { "epoch": 5.755138516532618, "grad_norm": 0.2591998279094696, "learning_rate": 2.277773320195551e-05, "loss": 0.8087, "num_input_tokens_seen": 22280624, "step": 38640 }, { "epoch": 5.755883229073578, "grad_norm": 0.22673802077770233, "learning_rate": 2.2771260160595516e-05, "loss": 0.8161, "num_input_tokens_seen": 22283344, "step": 38645 }, { "epoch": 5.7566279416145365, "grad_norm": 0.2540056109428406, "learning_rate": 2.2764787269844704e-05, "loss": 0.8188, "num_input_tokens_seen": 22286704, "step": 38650 }, { "epoch": 5.757372654155496, "grad_norm": 0.24248507618904114, "learning_rate": 2.2758314530140473e-05, "loss": 0.8172, "num_input_tokens_seen": 22290000, "step": 38655 }, { "epoch": 5.758117366696455, "grad_norm": 0.16120827198028564, "learning_rate": 2.2751841941920238e-05, "loss": 0.8205, "num_input_tokens_seen": 22293104, "step": 38660 }, { "epoch": 5.7588620792374146, "grad_norm": 0.24558863043785095, "learning_rate": 2.274536950562138e-05, "loss": 0.7852, "num_input_tokens_seen": 22296048, "step": 38665 }, { "epoch": 5.759606791778373, "grad_norm": 0.18630550801753998, "learning_rate": 2.2738897221681284e-05, "loss": 0.8244, "num_input_tokens_seen": 22298992, "step": 38670 }, { "epoch": 5.760351504319333, "grad_norm": 0.1561838984489441, "learning_rate": 2.2732425090537323e-05, "loss": 0.8236, "num_input_tokens_seen": 22302160, "step": 38675 }, { "epoch": 5.761096216860292, "grad_norm": 0.1873805671930313, "learning_rate": 2.272595311262685e-05, "loss": 0.79, "num_input_tokens_seen": 22305360, "step": 38680 }, { "epoch": 5.761840929401251, "grad_norm": 0.15531319379806519, "learning_rate": 2.2719481288387234e-05, "loss": 0.7993, "num_input_tokens_seen": 22308176, "step": 38685 }, { "epoch": 5.76258564194221, "grad_norm": 0.15078656375408173, "learning_rate": 2.2713009618255788e-05, "loss": 0.7957, "num_input_tokens_seen": 22311056, "step": 38690 }, { "epoch": 5.76333035448317, "grad_norm": 0.3065488040447235, "learning_rate": 2.270653810266986e-05, "loss": 0.8151, "num_input_tokens_seen": 22314224, "step": 38695 }, { "epoch": 5.7640750670241285, "grad_norm": 0.25399067997932434, "learning_rate": 2.2700066742066754e-05, "loss": 0.803, "num_input_tokens_seen": 22317104, "step": 38700 }, { "epoch": 5.764819779565088, "grad_norm": 0.22383596003055573, "learning_rate": 2.2693595536883792e-05, "loss": 0.7812, "num_input_tokens_seen": 22319952, "step": 38705 }, { "epoch": 5.765564492106047, "grad_norm": 0.27051907777786255, "learning_rate": 2.2687124487558267e-05, "loss": 0.7952, "num_input_tokens_seen": 22322864, "step": 38710 }, { "epoch": 5.766309204647007, "grad_norm": 0.1907336264848709, "learning_rate": 2.268065359452746e-05, "loss": 0.808, "num_input_tokens_seen": 22325648, "step": 38715 }, { "epoch": 5.767053917187965, "grad_norm": 0.23319607973098755, "learning_rate": 2.267418285822866e-05, "loss": 0.8036, "num_input_tokens_seen": 22328976, "step": 38720 }, { "epoch": 5.767798629728925, "grad_norm": 0.23221471905708313, "learning_rate": 2.266771227909913e-05, "loss": 0.8076, "num_input_tokens_seen": 22331696, "step": 38725 }, { "epoch": 5.768543342269884, "grad_norm": 0.2040918469429016, "learning_rate": 2.266124185757612e-05, "loss": 0.7946, "num_input_tokens_seen": 22334608, "step": 38730 }, { "epoch": 5.769288054810843, "grad_norm": 0.2736433148384094, "learning_rate": 2.265477159409687e-05, "loss": 0.7763, "num_input_tokens_seen": 22337360, "step": 38735 }, { "epoch": 5.770032767351802, "grad_norm": 0.2685556709766388, "learning_rate": 2.2648301489098624e-05, "loss": 0.7869, "num_input_tokens_seen": 22340272, "step": 38740 }, { "epoch": 5.770777479892761, "grad_norm": 0.30974993109703064, "learning_rate": 2.26418315430186e-05, "loss": 0.7975, "num_input_tokens_seen": 22343184, "step": 38745 }, { "epoch": 5.7715221924337206, "grad_norm": 0.21605826914310455, "learning_rate": 2.2635361756294014e-05, "loss": 0.78, "num_input_tokens_seen": 22345904, "step": 38750 }, { "epoch": 5.77226690497468, "grad_norm": 0.17196045815944672, "learning_rate": 2.2628892129362064e-05, "loss": 0.796, "num_input_tokens_seen": 22348432, "step": 38755 }, { "epoch": 5.773011617515639, "grad_norm": 0.19658038020133972, "learning_rate": 2.2622422662659952e-05, "loss": 0.7921, "num_input_tokens_seen": 22351152, "step": 38760 }, { "epoch": 5.773756330056598, "grad_norm": 0.2541671097278595, "learning_rate": 2.2615953356624852e-05, "loss": 0.8241, "num_input_tokens_seen": 22353936, "step": 38765 }, { "epoch": 5.774501042597557, "grad_norm": 0.22977915406227112, "learning_rate": 2.260948421169392e-05, "loss": 0.7945, "num_input_tokens_seen": 22356976, "step": 38770 }, { "epoch": 5.775245755138517, "grad_norm": 0.2811133563518524, "learning_rate": 2.260301522830433e-05, "loss": 0.8333, "num_input_tokens_seen": 22360048, "step": 38775 }, { "epoch": 5.775990467679476, "grad_norm": 0.2309812754392624, "learning_rate": 2.2596546406893227e-05, "loss": 0.8213, "num_input_tokens_seen": 22362768, "step": 38780 }, { "epoch": 5.7767351802204345, "grad_norm": 0.18979914486408234, "learning_rate": 2.259007774789774e-05, "loss": 0.8039, "num_input_tokens_seen": 22365808, "step": 38785 }, { "epoch": 5.777479892761394, "grad_norm": 0.2098831981420517, "learning_rate": 2.2583609251755004e-05, "loss": 0.777, "num_input_tokens_seen": 22368528, "step": 38790 }, { "epoch": 5.778224605302353, "grad_norm": 0.28849756717681885, "learning_rate": 2.2577140918902135e-05, "loss": 0.8261, "num_input_tokens_seen": 22371536, "step": 38795 }, { "epoch": 5.778969317843313, "grad_norm": 0.19893953204154968, "learning_rate": 2.2570672749776222e-05, "loss": 0.7897, "num_input_tokens_seen": 22374512, "step": 38800 }, { "epoch": 5.779714030384271, "grad_norm": 0.2284422069787979, "learning_rate": 2.2564204744814384e-05, "loss": 0.8105, "num_input_tokens_seen": 22377712, "step": 38805 }, { "epoch": 5.780458742925231, "grad_norm": 0.2049454152584076, "learning_rate": 2.2557736904453674e-05, "loss": 0.7977, "num_input_tokens_seen": 22380656, "step": 38810 }, { "epoch": 5.78120345546619, "grad_norm": 0.1786295771598816, "learning_rate": 2.255126922913118e-05, "loss": 0.7946, "num_input_tokens_seen": 22383568, "step": 38815 }, { "epoch": 5.781948168007149, "grad_norm": 0.2330089956521988, "learning_rate": 2.254480171928395e-05, "loss": 0.8009, "num_input_tokens_seen": 22386320, "step": 38820 }, { "epoch": 5.782692880548108, "grad_norm": 0.2534245550632477, "learning_rate": 2.2538334375349044e-05, "loss": 0.8046, "num_input_tokens_seen": 22389264, "step": 38825 }, { "epoch": 5.783437593089068, "grad_norm": 0.23975446820259094, "learning_rate": 2.2531867197763484e-05, "loss": 0.7632, "num_input_tokens_seen": 22392144, "step": 38830 }, { "epoch": 5.7841823056300266, "grad_norm": 0.2578939199447632, "learning_rate": 2.2525400186964308e-05, "loss": 0.8002, "num_input_tokens_seen": 22395024, "step": 38835 }, { "epoch": 5.784927018170986, "grad_norm": 0.21272973716259003, "learning_rate": 2.2518933343388528e-05, "loss": 0.7904, "num_input_tokens_seen": 22397776, "step": 38840 }, { "epoch": 5.785671730711945, "grad_norm": 0.26045161485671997, "learning_rate": 2.2512466667473152e-05, "loss": 0.7923, "num_input_tokens_seen": 22400976, "step": 38845 }, { "epoch": 5.786416443252905, "grad_norm": 0.21698158979415894, "learning_rate": 2.2506000159655158e-05, "loss": 0.8077, "num_input_tokens_seen": 22403952, "step": 38850 }, { "epoch": 5.787161155793863, "grad_norm": 0.18842531740665436, "learning_rate": 2.249953382037153e-05, "loss": 0.8, "num_input_tokens_seen": 22406960, "step": 38855 }, { "epoch": 5.787905868334823, "grad_norm": 0.26289013028144836, "learning_rate": 2.2493067650059247e-05, "loss": 0.7803, "num_input_tokens_seen": 22409968, "step": 38860 }, { "epoch": 5.788650580875782, "grad_norm": 0.2518022060394287, "learning_rate": 2.248660164915525e-05, "loss": 0.8171, "num_input_tokens_seen": 22412880, "step": 38865 }, { "epoch": 5.789395293416741, "grad_norm": 0.23334217071533203, "learning_rate": 2.2480135818096497e-05, "loss": 0.798, "num_input_tokens_seen": 22415568, "step": 38870 }, { "epoch": 5.7901400059577, "grad_norm": 0.24590077996253967, "learning_rate": 2.247367015731993e-05, "loss": 0.7995, "num_input_tokens_seen": 22418512, "step": 38875 }, { "epoch": 5.79088471849866, "grad_norm": 0.19199737906455994, "learning_rate": 2.2467204667262454e-05, "loss": 0.7915, "num_input_tokens_seen": 22421424, "step": 38880 }, { "epoch": 5.791629431039619, "grad_norm": 0.22198668122291565, "learning_rate": 2.2460739348361e-05, "loss": 0.8049, "num_input_tokens_seen": 22424592, "step": 38885 }, { "epoch": 5.792374143580578, "grad_norm": 0.22467032074928284, "learning_rate": 2.2454274201052443e-05, "loss": 0.7921, "num_input_tokens_seen": 22427440, "step": 38890 }, { "epoch": 5.793118856121537, "grad_norm": 0.21383161842823029, "learning_rate": 2.2447809225773698e-05, "loss": 0.842, "num_input_tokens_seen": 22430384, "step": 38895 }, { "epoch": 5.793863568662497, "grad_norm": 0.19067353010177612, "learning_rate": 2.2441344422961618e-05, "loss": 0.8111, "num_input_tokens_seen": 22433232, "step": 38900 }, { "epoch": 5.794608281203455, "grad_norm": 0.257943719625473, "learning_rate": 2.243487979305308e-05, "loss": 0.7969, "num_input_tokens_seen": 22436144, "step": 38905 }, { "epoch": 5.795352993744415, "grad_norm": 0.29099005460739136, "learning_rate": 2.2428415336484944e-05, "loss": 0.8119, "num_input_tokens_seen": 22439472, "step": 38910 }, { "epoch": 5.796097706285374, "grad_norm": 0.19100834429264069, "learning_rate": 2.2421951053694034e-05, "loss": 0.8183, "num_input_tokens_seen": 22442320, "step": 38915 }, { "epoch": 5.796842418826333, "grad_norm": 0.2021670639514923, "learning_rate": 2.2415486945117195e-05, "loss": 0.7883, "num_input_tokens_seen": 22444880, "step": 38920 }, { "epoch": 5.797587131367292, "grad_norm": 0.1904894858598709, "learning_rate": 2.2409023011191248e-05, "loss": 0.796, "num_input_tokens_seen": 22447632, "step": 38925 }, { "epoch": 5.798331843908251, "grad_norm": 0.17811718583106995, "learning_rate": 2.2402559252352988e-05, "loss": 0.7805, "num_input_tokens_seen": 22450288, "step": 38930 }, { "epoch": 5.799076556449211, "grad_norm": 0.15007495880126953, "learning_rate": 2.239609566903921e-05, "loss": 0.8053, "num_input_tokens_seen": 22453136, "step": 38935 }, { "epoch": 5.79982126899017, "grad_norm": 0.2702857553958893, "learning_rate": 2.23896322616867e-05, "loss": 0.8203, "num_input_tokens_seen": 22455888, "step": 38940 }, { "epoch": 5.800565981531129, "grad_norm": 0.22822892665863037, "learning_rate": 2.238316903073223e-05, "loss": 0.7952, "num_input_tokens_seen": 22458896, "step": 38945 }, { "epoch": 5.801310694072088, "grad_norm": 0.19236446917057037, "learning_rate": 2.2376705976612555e-05, "loss": 0.7958, "num_input_tokens_seen": 22461776, "step": 38950 }, { "epoch": 5.802055406613047, "grad_norm": 0.2826024293899536, "learning_rate": 2.2370243099764424e-05, "loss": 0.7859, "num_input_tokens_seen": 22464656, "step": 38955 }, { "epoch": 5.802800119154006, "grad_norm": 0.2608626186847687, "learning_rate": 2.2363780400624578e-05, "loss": 0.81, "num_input_tokens_seen": 22467536, "step": 38960 }, { "epoch": 5.803544831694966, "grad_norm": 0.19741415977478027, "learning_rate": 2.235731787962973e-05, "loss": 0.8151, "num_input_tokens_seen": 22470512, "step": 38965 }, { "epoch": 5.804289544235925, "grad_norm": 0.1750970333814621, "learning_rate": 2.2350855537216603e-05, "loss": 0.7881, "num_input_tokens_seen": 22473680, "step": 38970 }, { "epoch": 5.805034256776884, "grad_norm": 0.23871411383152008, "learning_rate": 2.234439337382188e-05, "loss": 0.7872, "num_input_tokens_seen": 22476656, "step": 38975 }, { "epoch": 5.805778969317843, "grad_norm": 0.3097381591796875, "learning_rate": 2.2337931389882262e-05, "loss": 0.7978, "num_input_tokens_seen": 22479568, "step": 38980 }, { "epoch": 5.806523681858803, "grad_norm": 0.1909860223531723, "learning_rate": 2.233146958583441e-05, "loss": 0.8199, "num_input_tokens_seen": 22482576, "step": 38985 }, { "epoch": 5.807268394399761, "grad_norm": 0.22776031494140625, "learning_rate": 2.2325007962115e-05, "loss": 0.8149, "num_input_tokens_seen": 22486768, "step": 38990 }, { "epoch": 5.808013106940721, "grad_norm": 0.14748035371303558, "learning_rate": 2.2318546519160672e-05, "loss": 0.8048, "num_input_tokens_seen": 22489488, "step": 38995 }, { "epoch": 5.80875781948168, "grad_norm": 0.21699272096157074, "learning_rate": 2.2312085257408066e-05, "loss": 0.797, "num_input_tokens_seen": 22492720, "step": 39000 }, { "epoch": 5.809502532022639, "grad_norm": 0.1664656698703766, "learning_rate": 2.2305624177293816e-05, "loss": 0.7979, "num_input_tokens_seen": 22495632, "step": 39005 }, { "epoch": 5.810247244563598, "grad_norm": 0.17084123194217682, "learning_rate": 2.2299163279254535e-05, "loss": 0.8111, "num_input_tokens_seen": 22498416, "step": 39010 }, { "epoch": 5.810991957104558, "grad_norm": 0.23717334866523743, "learning_rate": 2.229270256372681e-05, "loss": 0.8312, "num_input_tokens_seen": 22501136, "step": 39015 }, { "epoch": 5.811736669645517, "grad_norm": 0.3195718228816986, "learning_rate": 2.2286242031147236e-05, "loss": 0.8059, "num_input_tokens_seen": 22503984, "step": 39020 }, { "epoch": 5.812481382186476, "grad_norm": 0.2422681450843811, "learning_rate": 2.22797816819524e-05, "loss": 0.8108, "num_input_tokens_seen": 22507088, "step": 39025 }, { "epoch": 5.813226094727435, "grad_norm": 0.17187218368053436, "learning_rate": 2.227332151657885e-05, "loss": 0.8221, "num_input_tokens_seen": 22509968, "step": 39030 }, { "epoch": 5.813970807268395, "grad_norm": 0.2418108433485031, "learning_rate": 2.226686153546315e-05, "loss": 0.8246, "num_input_tokens_seen": 22512976, "step": 39035 }, { "epoch": 5.814715519809353, "grad_norm": 0.202362522482872, "learning_rate": 2.2260401739041837e-05, "loss": 0.7894, "num_input_tokens_seen": 22515952, "step": 39040 }, { "epoch": 5.815460232350313, "grad_norm": 0.2120276391506195, "learning_rate": 2.2253942127751432e-05, "loss": 0.7955, "num_input_tokens_seen": 22518864, "step": 39045 }, { "epoch": 5.816204944891272, "grad_norm": 0.25150835514068604, "learning_rate": 2.2247482702028466e-05, "loss": 0.821, "num_input_tokens_seen": 22521520, "step": 39050 }, { "epoch": 5.8169496574322315, "grad_norm": 0.20759810507297516, "learning_rate": 2.2241023462309416e-05, "loss": 0.7889, "num_input_tokens_seen": 22524656, "step": 39055 }, { "epoch": 5.81769436997319, "grad_norm": 0.21549227833747864, "learning_rate": 2.2234564409030788e-05, "loss": 0.8019, "num_input_tokens_seen": 22527536, "step": 39060 }, { "epoch": 5.81843908251415, "grad_norm": 0.234014093875885, "learning_rate": 2.2228105542629047e-05, "loss": 0.7992, "num_input_tokens_seen": 22530416, "step": 39065 }, { "epoch": 5.819183795055109, "grad_norm": 0.18918752670288086, "learning_rate": 2.2221646863540664e-05, "loss": 0.7948, "num_input_tokens_seen": 22533296, "step": 39070 }, { "epoch": 5.819928507596068, "grad_norm": 0.2741568088531494, "learning_rate": 2.2215188372202097e-05, "loss": 0.8177, "num_input_tokens_seen": 22536176, "step": 39075 }, { "epoch": 5.820673220137027, "grad_norm": 0.23449978232383728, "learning_rate": 2.2208730069049775e-05, "loss": 0.8159, "num_input_tokens_seen": 22538768, "step": 39080 }, { "epoch": 5.821417932677987, "grad_norm": 0.22629739344120026, "learning_rate": 2.220227195452012e-05, "loss": 0.7848, "num_input_tokens_seen": 22541744, "step": 39085 }, { "epoch": 5.822162645218945, "grad_norm": 0.2453158050775528, "learning_rate": 2.2195814029049568e-05, "loss": 0.8064, "num_input_tokens_seen": 22544624, "step": 39090 }, { "epoch": 5.822907357759904, "grad_norm": 0.1909223049879074, "learning_rate": 2.2189356293074495e-05, "loss": 0.8016, "num_input_tokens_seen": 22547184, "step": 39095 }, { "epoch": 5.823652070300864, "grad_norm": 0.22170841693878174, "learning_rate": 2.218289874703129e-05, "loss": 0.7765, "num_input_tokens_seen": 22549872, "step": 39100 }, { "epoch": 5.8243967828418235, "grad_norm": 0.19192293286323547, "learning_rate": 2.2176441391356336e-05, "loss": 0.8079, "num_input_tokens_seen": 22552816, "step": 39105 }, { "epoch": 5.825141495382782, "grad_norm": 0.26105207204818726, "learning_rate": 2.2169984226485998e-05, "loss": 0.8005, "num_input_tokens_seen": 22555792, "step": 39110 }, { "epoch": 5.825886207923741, "grad_norm": 0.25950703024864197, "learning_rate": 2.2163527252856614e-05, "loss": 0.8251, "num_input_tokens_seen": 22558832, "step": 39115 }, { "epoch": 5.826630920464701, "grad_norm": 0.21083508431911469, "learning_rate": 2.2157070470904528e-05, "loss": 0.78, "num_input_tokens_seen": 22561744, "step": 39120 }, { "epoch": 5.82737563300566, "grad_norm": 0.22401581704616547, "learning_rate": 2.2150613881066063e-05, "loss": 0.7808, "num_input_tokens_seen": 22564528, "step": 39125 }, { "epoch": 5.828120345546619, "grad_norm": 0.19165657460689545, "learning_rate": 2.2144157483777538e-05, "loss": 0.8137, "num_input_tokens_seen": 22567696, "step": 39130 }, { "epoch": 5.828865058087578, "grad_norm": 0.20937573909759521, "learning_rate": 2.2137701279475224e-05, "loss": 0.7909, "num_input_tokens_seen": 22570384, "step": 39135 }, { "epoch": 5.8296097706285375, "grad_norm": 0.19479553401470184, "learning_rate": 2.213124526859542e-05, "loss": 0.8024, "num_input_tokens_seen": 22573360, "step": 39140 }, { "epoch": 5.830354483169496, "grad_norm": 0.23151956498622894, "learning_rate": 2.2124789451574405e-05, "loss": 0.7814, "num_input_tokens_seen": 22576304, "step": 39145 }, { "epoch": 5.831099195710456, "grad_norm": 0.21965081989765167, "learning_rate": 2.2118333828848422e-05, "loss": 0.8102, "num_input_tokens_seen": 22578960, "step": 39150 }, { "epoch": 5.831843908251415, "grad_norm": 0.1908380687236786, "learning_rate": 2.2111878400853732e-05, "loss": 0.7988, "num_input_tokens_seen": 22582160, "step": 39155 }, { "epoch": 5.832588620792374, "grad_norm": 0.21503113210201263, "learning_rate": 2.2105423168026545e-05, "loss": 0.8069, "num_input_tokens_seen": 22584912, "step": 39160 }, { "epoch": 5.833333333333333, "grad_norm": 0.28116750717163086, "learning_rate": 2.2098968130803096e-05, "loss": 0.7792, "num_input_tokens_seen": 22587824, "step": 39165 }, { "epoch": 5.834078045874293, "grad_norm": 0.19106416404247284, "learning_rate": 2.2092513289619597e-05, "loss": 0.7839, "num_input_tokens_seen": 22590736, "step": 39170 }, { "epoch": 5.834822758415251, "grad_norm": 0.22772081196308136, "learning_rate": 2.208605864491222e-05, "loss": 0.8291, "num_input_tokens_seen": 22593552, "step": 39175 }, { "epoch": 5.835567470956211, "grad_norm": 0.23664076626300812, "learning_rate": 2.2079604197117152e-05, "loss": 0.7777, "num_input_tokens_seen": 22596752, "step": 39180 }, { "epoch": 5.83631218349717, "grad_norm": 0.22457614541053772, "learning_rate": 2.2073149946670556e-05, "loss": 0.8073, "num_input_tokens_seen": 22599376, "step": 39185 }, { "epoch": 5.8370568960381295, "grad_norm": 0.16688895225524902, "learning_rate": 2.2066695894008595e-05, "loss": 0.8139, "num_input_tokens_seen": 22602288, "step": 39190 }, { "epoch": 5.837801608579088, "grad_norm": 0.2439187914133072, "learning_rate": 2.2060242039567393e-05, "loss": 0.7966, "num_input_tokens_seen": 22605264, "step": 39195 }, { "epoch": 5.838546321120048, "grad_norm": 0.27043119072914124, "learning_rate": 2.205378838378308e-05, "loss": 0.7882, "num_input_tokens_seen": 22608208, "step": 39200 }, { "epoch": 5.839291033661007, "grad_norm": 0.2333611696958542, "learning_rate": 2.204733492709178e-05, "loss": 0.8108, "num_input_tokens_seen": 22610960, "step": 39205 }, { "epoch": 5.840035746201966, "grad_norm": 0.2388993352651596, "learning_rate": 2.2040881669929582e-05, "loss": 0.7852, "num_input_tokens_seen": 22613840, "step": 39210 }, { "epoch": 5.840780458742925, "grad_norm": 0.17142611742019653, "learning_rate": 2.203442861273256e-05, "loss": 0.7941, "num_input_tokens_seen": 22616592, "step": 39215 }, { "epoch": 5.841525171283885, "grad_norm": 0.20425722002983093, "learning_rate": 2.202797575593679e-05, "loss": 0.8021, "num_input_tokens_seen": 22619376, "step": 39220 }, { "epoch": 5.8422698838248435, "grad_norm": 0.20305517315864563, "learning_rate": 2.2021523099978347e-05, "loss": 0.8191, "num_input_tokens_seen": 22622160, "step": 39225 }, { "epoch": 5.843014596365803, "grad_norm": 0.19180642068386078, "learning_rate": 2.2015070645293257e-05, "loss": 0.7865, "num_input_tokens_seen": 22624816, "step": 39230 }, { "epoch": 5.843759308906762, "grad_norm": 0.24362705647945404, "learning_rate": 2.2008618392317557e-05, "loss": 0.8237, "num_input_tokens_seen": 22627568, "step": 39235 }, { "epoch": 5.8445040214477215, "grad_norm": 0.25466397404670715, "learning_rate": 2.2002166341487267e-05, "loss": 0.8193, "num_input_tokens_seen": 22630192, "step": 39240 }, { "epoch": 5.84524873398868, "grad_norm": 0.19734178483486176, "learning_rate": 2.1995714493238383e-05, "loss": 0.8126, "num_input_tokens_seen": 22633104, "step": 39245 }, { "epoch": 5.84599344652964, "grad_norm": 0.34979015588760376, "learning_rate": 2.1989262848006912e-05, "loss": 0.7887, "num_input_tokens_seen": 22636112, "step": 39250 }, { "epoch": 5.846738159070599, "grad_norm": 0.2863588035106659, "learning_rate": 2.1982811406228805e-05, "loss": 0.8184, "num_input_tokens_seen": 22638960, "step": 39255 }, { "epoch": 5.847482871611557, "grad_norm": 0.17849276959896088, "learning_rate": 2.1976360168340042e-05, "loss": 0.7866, "num_input_tokens_seen": 22641808, "step": 39260 }, { "epoch": 5.848227584152517, "grad_norm": 0.2192312330007553, "learning_rate": 2.1969909134776555e-05, "loss": 0.7898, "num_input_tokens_seen": 22644560, "step": 39265 }, { "epoch": 5.848972296693477, "grad_norm": 0.19043207168579102, "learning_rate": 2.1963458305974297e-05, "loss": 0.7871, "num_input_tokens_seen": 22647248, "step": 39270 }, { "epoch": 5.8497170092344355, "grad_norm": 0.18449655175209045, "learning_rate": 2.1957007682369182e-05, "loss": 0.7959, "num_input_tokens_seen": 22650000, "step": 39275 }, { "epoch": 5.850461721775394, "grad_norm": 0.2133496105670929, "learning_rate": 2.195055726439711e-05, "loss": 0.8205, "num_input_tokens_seen": 22652752, "step": 39280 }, { "epoch": 5.851206434316354, "grad_norm": 0.3251565992832184, "learning_rate": 2.1944107052493984e-05, "loss": 0.846, "num_input_tokens_seen": 22655952, "step": 39285 }, { "epoch": 5.8519511468573135, "grad_norm": 0.17425751686096191, "learning_rate": 2.1937657047095687e-05, "loss": 0.7891, "num_input_tokens_seen": 22658480, "step": 39290 }, { "epoch": 5.852695859398272, "grad_norm": 0.2244100570678711, "learning_rate": 2.193120724863807e-05, "loss": 0.7791, "num_input_tokens_seen": 22661488, "step": 39295 }, { "epoch": 5.853440571939231, "grad_norm": 0.24569986760616302, "learning_rate": 2.1924757657556986e-05, "loss": 0.8103, "num_input_tokens_seen": 22664496, "step": 39300 }, { "epoch": 5.854185284480191, "grad_norm": 0.17133638262748718, "learning_rate": 2.1918308274288278e-05, "loss": 0.8053, "num_input_tokens_seen": 22667376, "step": 39305 }, { "epoch": 5.8549299970211495, "grad_norm": 0.29672709107398987, "learning_rate": 2.191185909926777e-05, "loss": 0.7811, "num_input_tokens_seen": 22670096, "step": 39310 }, { "epoch": 5.855674709562109, "grad_norm": 0.18476174771785736, "learning_rate": 2.1905410132931263e-05, "loss": 0.7929, "num_input_tokens_seen": 22672752, "step": 39315 }, { "epoch": 5.856419422103068, "grad_norm": 0.20335018634796143, "learning_rate": 2.1898961375714567e-05, "loss": 0.7708, "num_input_tokens_seen": 22676016, "step": 39320 }, { "epoch": 5.8571641346440275, "grad_norm": 0.18976660072803497, "learning_rate": 2.1892512828053443e-05, "loss": 0.7786, "num_input_tokens_seen": 22678992, "step": 39325 }, { "epoch": 5.857908847184986, "grad_norm": 0.2213248461484909, "learning_rate": 2.1886064490383682e-05, "loss": 0.8138, "num_input_tokens_seen": 22681840, "step": 39330 }, { "epoch": 5.858653559725946, "grad_norm": 0.20023375749588013, "learning_rate": 2.1879616363141012e-05, "loss": 0.8189, "num_input_tokens_seen": 22684624, "step": 39335 }, { "epoch": 5.859398272266905, "grad_norm": 0.21608898043632507, "learning_rate": 2.1873168446761184e-05, "loss": 0.8172, "num_input_tokens_seen": 22687280, "step": 39340 }, { "epoch": 5.860142984807864, "grad_norm": 0.19925197958946228, "learning_rate": 2.1866720741679918e-05, "loss": 0.7854, "num_input_tokens_seen": 22690480, "step": 39345 }, { "epoch": 5.860887697348823, "grad_norm": 0.3164004683494568, "learning_rate": 2.186027324833292e-05, "loss": 0.8289, "num_input_tokens_seen": 22693744, "step": 39350 }, { "epoch": 5.861632409889783, "grad_norm": 0.23564870655536652, "learning_rate": 2.18538259671559e-05, "loss": 0.8065, "num_input_tokens_seen": 22696816, "step": 39355 }, { "epoch": 5.8623771224307415, "grad_norm": 0.21779701113700867, "learning_rate": 2.1847378898584524e-05, "loss": 0.7984, "num_input_tokens_seen": 22699632, "step": 39360 }, { "epoch": 5.863121834971701, "grad_norm": 0.23851674795150757, "learning_rate": 2.184093204305446e-05, "loss": 0.7913, "num_input_tokens_seen": 22702640, "step": 39365 }, { "epoch": 5.86386654751266, "grad_norm": 0.20732200145721436, "learning_rate": 2.1834485401001384e-05, "loss": 0.7979, "num_input_tokens_seen": 22705648, "step": 39370 }, { "epoch": 5.8646112600536195, "grad_norm": 0.1671118289232254, "learning_rate": 2.1828038972860904e-05, "loss": 0.7947, "num_input_tokens_seen": 22708144, "step": 39375 }, { "epoch": 5.865355972594578, "grad_norm": 0.24562793970108032, "learning_rate": 2.182159275906865e-05, "loss": 0.808, "num_input_tokens_seen": 22710896, "step": 39380 }, { "epoch": 5.866100685135538, "grad_norm": 0.1753314584493637, "learning_rate": 2.1815146760060234e-05, "loss": 0.7949, "num_input_tokens_seen": 22713872, "step": 39385 }, { "epoch": 5.866845397676497, "grad_norm": 0.16164933145046234, "learning_rate": 2.1808700976271256e-05, "loss": 0.784, "num_input_tokens_seen": 22716688, "step": 39390 }, { "epoch": 5.867590110217456, "grad_norm": 0.2901461124420166, "learning_rate": 2.1802255408137286e-05, "loss": 0.814, "num_input_tokens_seen": 22719344, "step": 39395 }, { "epoch": 5.868334822758415, "grad_norm": 0.187490314245224, "learning_rate": 2.1795810056093896e-05, "loss": 0.8207, "num_input_tokens_seen": 22722032, "step": 39400 }, { "epoch": 5.869079535299375, "grad_norm": 0.19772405922412872, "learning_rate": 2.178936492057664e-05, "loss": 0.7888, "num_input_tokens_seen": 22724848, "step": 39405 }, { "epoch": 5.8698242478403335, "grad_norm": 0.2797966003417969, "learning_rate": 2.1782920002021054e-05, "loss": 0.7802, "num_input_tokens_seen": 22728880, "step": 39410 }, { "epoch": 5.870568960381293, "grad_norm": 0.17878618836402893, "learning_rate": 2.1776475300862646e-05, "loss": 0.8153, "num_input_tokens_seen": 22731568, "step": 39415 }, { "epoch": 5.871313672922252, "grad_norm": 0.21706610918045044, "learning_rate": 2.1770030817536928e-05, "loss": 0.8038, "num_input_tokens_seen": 22734416, "step": 39420 }, { "epoch": 5.872058385463212, "grad_norm": 0.19084687530994415, "learning_rate": 2.17635865524794e-05, "loss": 0.7913, "num_input_tokens_seen": 22737392, "step": 39425 }, { "epoch": 5.87280309800417, "grad_norm": 0.208163782954216, "learning_rate": 2.1757142506125534e-05, "loss": 0.7953, "num_input_tokens_seen": 22739920, "step": 39430 }, { "epoch": 5.87354781054513, "grad_norm": 0.14565573632717133, "learning_rate": 2.1750698678910788e-05, "loss": 0.8075, "num_input_tokens_seen": 22742992, "step": 39435 }, { "epoch": 5.874292523086089, "grad_norm": 0.249307781457901, "learning_rate": 2.174425507127062e-05, "loss": 0.7743, "num_input_tokens_seen": 22746096, "step": 39440 }, { "epoch": 5.8750372356270475, "grad_norm": 0.24588918685913086, "learning_rate": 2.1737811683640455e-05, "loss": 0.8086, "num_input_tokens_seen": 22749200, "step": 39445 }, { "epoch": 5.875781948168007, "grad_norm": 0.1957862228155136, "learning_rate": 2.1731368516455723e-05, "loss": 0.7975, "num_input_tokens_seen": 22752080, "step": 39450 }, { "epoch": 5.876526660708967, "grad_norm": 0.2798534333705902, "learning_rate": 2.1724925570151806e-05, "loss": 0.8125, "num_input_tokens_seen": 22755152, "step": 39455 }, { "epoch": 5.8772713732499255, "grad_norm": 0.23178578913211823, "learning_rate": 2.171848284516411e-05, "loss": 0.7822, "num_input_tokens_seen": 22757968, "step": 39460 }, { "epoch": 5.878016085790884, "grad_norm": 0.21855439245700836, "learning_rate": 2.1712040341927998e-05, "loss": 0.8168, "num_input_tokens_seen": 22761008, "step": 39465 }, { "epoch": 5.878760798331844, "grad_norm": 0.21386808156967163, "learning_rate": 2.170559806087883e-05, "loss": 0.7944, "num_input_tokens_seen": 22763856, "step": 39470 }, { "epoch": 5.879505510872804, "grad_norm": 0.19930580258369446, "learning_rate": 2.1699156002451954e-05, "loss": 0.8307, "num_input_tokens_seen": 22766480, "step": 39475 }, { "epoch": 5.880250223413762, "grad_norm": 0.20947851240634918, "learning_rate": 2.169271416708269e-05, "loss": 0.7988, "num_input_tokens_seen": 22769072, "step": 39480 }, { "epoch": 5.880994935954721, "grad_norm": 0.35331863164901733, "learning_rate": 2.1686272555206363e-05, "loss": 0.8049, "num_input_tokens_seen": 22771888, "step": 39485 }, { "epoch": 5.881739648495681, "grad_norm": 0.21222048997879028, "learning_rate": 2.1679831167258267e-05, "loss": 0.8145, "num_input_tokens_seen": 22774768, "step": 39490 }, { "epoch": 5.8824843610366395, "grad_norm": 0.18718665838241577, "learning_rate": 2.1673390003673678e-05, "loss": 0.8064, "num_input_tokens_seen": 22777584, "step": 39495 }, { "epoch": 5.883229073577599, "grad_norm": 0.24680469930171967, "learning_rate": 2.1666949064887862e-05, "loss": 0.7812, "num_input_tokens_seen": 22780080, "step": 39500 }, { "epoch": 5.883973786118558, "grad_norm": 0.24586938321590424, "learning_rate": 2.1660508351336086e-05, "loss": 0.8061, "num_input_tokens_seen": 22782960, "step": 39505 }, { "epoch": 5.884718498659518, "grad_norm": 0.14536143839359283, "learning_rate": 2.1654067863453568e-05, "loss": 0.8005, "num_input_tokens_seen": 22785648, "step": 39510 }, { "epoch": 5.885463211200476, "grad_norm": 0.23009167611598969, "learning_rate": 2.1647627601675542e-05, "loss": 0.7969, "num_input_tokens_seen": 22788432, "step": 39515 }, { "epoch": 5.886207923741436, "grad_norm": 0.25629597902297974, "learning_rate": 2.164118756643722e-05, "loss": 0.7926, "num_input_tokens_seen": 22791504, "step": 39520 }, { "epoch": 5.886952636282395, "grad_norm": 0.15581870079040527, "learning_rate": 2.163474775817378e-05, "loss": 0.8088, "num_input_tokens_seen": 22794256, "step": 39525 }, { "epoch": 5.887697348823354, "grad_norm": 0.18975591659545898, "learning_rate": 2.1628308177320418e-05, "loss": 0.7902, "num_input_tokens_seen": 22797104, "step": 39530 }, { "epoch": 5.888442061364313, "grad_norm": 0.1925438791513443, "learning_rate": 2.1621868824312264e-05, "loss": 0.7748, "num_input_tokens_seen": 22799952, "step": 39535 }, { "epoch": 5.889186773905273, "grad_norm": 0.1779216229915619, "learning_rate": 2.161542969958449e-05, "loss": 0.7933, "num_input_tokens_seen": 22802608, "step": 39540 }, { "epoch": 5.8899314864462315, "grad_norm": 0.21973644196987152, "learning_rate": 2.160899080357221e-05, "loss": 0.7979, "num_input_tokens_seen": 22805936, "step": 39545 }, { "epoch": 5.890676198987191, "grad_norm": 0.21984335780143738, "learning_rate": 2.1602552136710543e-05, "loss": 0.8311, "num_input_tokens_seen": 22808848, "step": 39550 }, { "epoch": 5.89142091152815, "grad_norm": 0.18135453760623932, "learning_rate": 2.1596113699434597e-05, "loss": 0.8145, "num_input_tokens_seen": 22811696, "step": 39555 }, { "epoch": 5.89216562406911, "grad_norm": 0.20915210247039795, "learning_rate": 2.1589675492179444e-05, "loss": 0.8169, "num_input_tokens_seen": 22814864, "step": 39560 }, { "epoch": 5.892910336610068, "grad_norm": 0.22198772430419922, "learning_rate": 2.1583237515380153e-05, "loss": 0.8119, "num_input_tokens_seen": 22817872, "step": 39565 }, { "epoch": 5.893655049151028, "grad_norm": 0.26680684089660645, "learning_rate": 2.1576799769471787e-05, "loss": 0.8421, "num_input_tokens_seen": 22820720, "step": 39570 }, { "epoch": 5.894399761691987, "grad_norm": 0.2230926752090454, "learning_rate": 2.157036225488938e-05, "loss": 0.788, "num_input_tokens_seen": 22823536, "step": 39575 }, { "epoch": 5.895144474232946, "grad_norm": 0.2514615058898926, "learning_rate": 2.1563924972067934e-05, "loss": 0.8418, "num_input_tokens_seen": 22826608, "step": 39580 }, { "epoch": 5.895889186773905, "grad_norm": 0.15674114227294922, "learning_rate": 2.155748792144247e-05, "loss": 0.7945, "num_input_tokens_seen": 22829488, "step": 39585 }, { "epoch": 5.896633899314865, "grad_norm": 0.40445610880851746, "learning_rate": 2.1551051103447982e-05, "loss": 0.776, "num_input_tokens_seen": 22832464, "step": 39590 }, { "epoch": 5.897378611855824, "grad_norm": 0.25205036997795105, "learning_rate": 2.1544614518519434e-05, "loss": 0.8118, "num_input_tokens_seen": 22835184, "step": 39595 }, { "epoch": 5.898123324396783, "grad_norm": 0.21595534682273865, "learning_rate": 2.1538178167091787e-05, "loss": 0.8077, "num_input_tokens_seen": 22838000, "step": 39600 }, { "epoch": 5.898868036937742, "grad_norm": 0.18466559052467346, "learning_rate": 2.153174204959999e-05, "loss": 0.7747, "num_input_tokens_seen": 22840880, "step": 39605 }, { "epoch": 5.899612749478701, "grad_norm": 0.2720571756362915, "learning_rate": 2.1525306166478957e-05, "loss": 0.8256, "num_input_tokens_seen": 22844176, "step": 39610 }, { "epoch": 5.90035746201966, "grad_norm": 0.18929700553417206, "learning_rate": 2.151887051816362e-05, "loss": 0.793, "num_input_tokens_seen": 22846992, "step": 39615 }, { "epoch": 5.90110217456062, "grad_norm": 0.21125109493732452, "learning_rate": 2.1512435105088847e-05, "loss": 0.7844, "num_input_tokens_seen": 22849680, "step": 39620 }, { "epoch": 5.901846887101579, "grad_norm": 0.18263603746891022, "learning_rate": 2.1505999927689536e-05, "loss": 0.7745, "num_input_tokens_seen": 22852688, "step": 39625 }, { "epoch": 5.9025915996425375, "grad_norm": 0.19816231727600098, "learning_rate": 2.149956498640054e-05, "loss": 0.7887, "num_input_tokens_seen": 22855952, "step": 39630 }, { "epoch": 5.903336312183497, "grad_norm": 0.2044331133365631, "learning_rate": 2.1493130281656708e-05, "loss": 0.8058, "num_input_tokens_seen": 22858768, "step": 39635 }, { "epoch": 5.904081024724457, "grad_norm": 0.21854735910892487, "learning_rate": 2.1486695813892883e-05, "loss": 0.8222, "num_input_tokens_seen": 22861424, "step": 39640 }, { "epoch": 5.904825737265416, "grad_norm": 0.1912318468093872, "learning_rate": 2.1480261583543866e-05, "loss": 0.7927, "num_input_tokens_seen": 22864336, "step": 39645 }, { "epoch": 5.905570449806374, "grad_norm": 0.18161800503730774, "learning_rate": 2.1473827591044464e-05, "loss": 0.7651, "num_input_tokens_seen": 22867088, "step": 39650 }, { "epoch": 5.906315162347334, "grad_norm": 0.207632914185524, "learning_rate": 2.1467393836829454e-05, "loss": 0.7939, "num_input_tokens_seen": 22869936, "step": 39655 }, { "epoch": 5.907059874888293, "grad_norm": 0.22253280878067017, "learning_rate": 2.146096032133361e-05, "loss": 0.808, "num_input_tokens_seen": 22872976, "step": 39660 }, { "epoch": 5.907804587429252, "grad_norm": 0.22675617039203644, "learning_rate": 2.1454527044991673e-05, "loss": 0.792, "num_input_tokens_seen": 22875824, "step": 39665 }, { "epoch": 5.908549299970211, "grad_norm": 0.23462459444999695, "learning_rate": 2.144809400823839e-05, "loss": 0.779, "num_input_tokens_seen": 22878672, "step": 39670 }, { "epoch": 5.909294012511171, "grad_norm": 0.17927046120166779, "learning_rate": 2.1441661211508465e-05, "loss": 0.7563, "num_input_tokens_seen": 22881392, "step": 39675 }, { "epoch": 5.91003872505213, "grad_norm": 0.3564645051956177, "learning_rate": 2.1435228655236608e-05, "loss": 0.7649, "num_input_tokens_seen": 22884144, "step": 39680 }, { "epoch": 5.910783437593089, "grad_norm": 0.1825801134109497, "learning_rate": 2.1428796339857513e-05, "loss": 0.8219, "num_input_tokens_seen": 22886960, "step": 39685 }, { "epoch": 5.911528150134048, "grad_norm": 0.19214752316474915, "learning_rate": 2.1422364265805832e-05, "loss": 0.8164, "num_input_tokens_seen": 22889520, "step": 39690 }, { "epoch": 5.912272862675008, "grad_norm": 0.21059416234493256, "learning_rate": 2.1415932433516243e-05, "loss": 0.8025, "num_input_tokens_seen": 22892240, "step": 39695 }, { "epoch": 5.913017575215966, "grad_norm": 0.22235190868377686, "learning_rate": 2.140950084342336e-05, "loss": 0.7789, "num_input_tokens_seen": 22895024, "step": 39700 }, { "epoch": 5.913762287756926, "grad_norm": 0.20867201685905457, "learning_rate": 2.1403069495961813e-05, "loss": 0.8158, "num_input_tokens_seen": 22898000, "step": 39705 }, { "epoch": 5.914507000297885, "grad_norm": 0.170180544257164, "learning_rate": 2.13966383915662e-05, "loss": 0.8034, "num_input_tokens_seen": 22900848, "step": 39710 }, { "epoch": 5.915251712838844, "grad_norm": 0.24240174889564514, "learning_rate": 2.1390207530671115e-05, "loss": 0.7779, "num_input_tokens_seen": 22903728, "step": 39715 }, { "epoch": 5.915996425379803, "grad_norm": 0.22085270285606384, "learning_rate": 2.1383776913711135e-05, "loss": 0.8092, "num_input_tokens_seen": 22906480, "step": 39720 }, { "epoch": 5.916741137920763, "grad_norm": 0.21368125081062317, "learning_rate": 2.1377346541120803e-05, "loss": 0.8182, "num_input_tokens_seen": 22909072, "step": 39725 }, { "epoch": 5.917485850461722, "grad_norm": 0.26204919815063477, "learning_rate": 2.1370916413334663e-05, "loss": 0.7905, "num_input_tokens_seen": 22912368, "step": 39730 }, { "epoch": 5.918230563002681, "grad_norm": 0.3036293685436249, "learning_rate": 2.1364486530787247e-05, "loss": 0.7837, "num_input_tokens_seen": 22915088, "step": 39735 }, { "epoch": 5.91897527554364, "grad_norm": 0.23315075039863586, "learning_rate": 2.1358056893913047e-05, "loss": 0.8168, "num_input_tokens_seen": 22918064, "step": 39740 }, { "epoch": 5.9197199880846, "grad_norm": 0.2735464572906494, "learning_rate": 2.1351627503146547e-05, "loss": 0.8067, "num_input_tokens_seen": 22920688, "step": 39745 }, { "epoch": 5.920464700625558, "grad_norm": 0.28233030438423157, "learning_rate": 2.134519835892223e-05, "loss": 0.8172, "num_input_tokens_seen": 22923760, "step": 39750 }, { "epoch": 5.921209413166518, "grad_norm": 0.18257035315036774, "learning_rate": 2.133876946167455e-05, "loss": 0.7836, "num_input_tokens_seen": 22926704, "step": 39755 }, { "epoch": 5.921954125707477, "grad_norm": 0.17648227512836456, "learning_rate": 2.1332340811837944e-05, "loss": 0.7683, "num_input_tokens_seen": 22929520, "step": 39760 }, { "epoch": 5.9226988382484365, "grad_norm": 0.2074262648820877, "learning_rate": 2.1325912409846834e-05, "loss": 0.819, "num_input_tokens_seen": 22932496, "step": 39765 }, { "epoch": 5.923443550789395, "grad_norm": 0.26111963391304016, "learning_rate": 2.131948425613563e-05, "loss": 0.8214, "num_input_tokens_seen": 22935216, "step": 39770 }, { "epoch": 5.924188263330355, "grad_norm": 0.2832421660423279, "learning_rate": 2.1313056351138715e-05, "loss": 0.8234, "num_input_tokens_seen": 22938224, "step": 39775 }, { "epoch": 5.924932975871314, "grad_norm": 0.32013392448425293, "learning_rate": 2.1306628695290458e-05, "loss": 0.8241, "num_input_tokens_seen": 22941040, "step": 39780 }, { "epoch": 5.925677688412273, "grad_norm": 0.2002505362033844, "learning_rate": 2.1300201289025215e-05, "loss": 0.8107, "num_input_tokens_seen": 22943504, "step": 39785 }, { "epoch": 5.926422400953232, "grad_norm": 0.18161100149154663, "learning_rate": 2.1293774132777332e-05, "loss": 0.8065, "num_input_tokens_seen": 22946160, "step": 39790 }, { "epoch": 5.927167113494191, "grad_norm": 0.20846624672412872, "learning_rate": 2.128734722698112e-05, "loss": 0.8222, "num_input_tokens_seen": 22949296, "step": 39795 }, { "epoch": 5.92791182603515, "grad_norm": 0.25143900513648987, "learning_rate": 2.128092057207089e-05, "loss": 0.7969, "num_input_tokens_seen": 22952176, "step": 39800 }, { "epoch": 5.92865653857611, "grad_norm": 0.2365494668483734, "learning_rate": 2.127449416848093e-05, "loss": 0.8044, "num_input_tokens_seen": 22954864, "step": 39805 }, { "epoch": 5.929401251117069, "grad_norm": 0.1447133868932724, "learning_rate": 2.1268068016645505e-05, "loss": 0.8095, "num_input_tokens_seen": 22957680, "step": 39810 }, { "epoch": 5.930145963658028, "grad_norm": 0.23113980889320374, "learning_rate": 2.1261642116998877e-05, "loss": 0.8404, "num_input_tokens_seen": 22960432, "step": 39815 }, { "epoch": 5.930890676198987, "grad_norm": 0.17714424431324005, "learning_rate": 2.1255216469975265e-05, "loss": 0.8065, "num_input_tokens_seen": 22963120, "step": 39820 }, { "epoch": 5.931635388739946, "grad_norm": 0.23990438878536224, "learning_rate": 2.1248791076008906e-05, "loss": 0.8201, "num_input_tokens_seen": 22965872, "step": 39825 }, { "epoch": 5.932380101280906, "grad_norm": 0.3074108958244324, "learning_rate": 2.1242365935533988e-05, "loss": 0.8185, "num_input_tokens_seen": 22968528, "step": 39830 }, { "epoch": 5.933124813821864, "grad_norm": 0.20603398978710175, "learning_rate": 2.123594104898471e-05, "loss": 0.8155, "num_input_tokens_seen": 22971440, "step": 39835 }, { "epoch": 5.933869526362824, "grad_norm": 0.190220445394516, "learning_rate": 2.1229516416795224e-05, "loss": 0.8065, "num_input_tokens_seen": 22974320, "step": 39840 }, { "epoch": 5.934614238903783, "grad_norm": 0.219956636428833, "learning_rate": 2.1223092039399695e-05, "loss": 0.7963, "num_input_tokens_seen": 22977616, "step": 39845 }, { "epoch": 5.9353589514447425, "grad_norm": 0.3075593113899231, "learning_rate": 2.121666791723225e-05, "loss": 0.8258, "num_input_tokens_seen": 22980432, "step": 39850 }, { "epoch": 5.936103663985701, "grad_norm": 0.18993863463401794, "learning_rate": 2.1210244050727014e-05, "loss": 0.8049, "num_input_tokens_seen": 22983216, "step": 39855 }, { "epoch": 5.936848376526661, "grad_norm": 0.18288885056972504, "learning_rate": 2.1203820440318063e-05, "loss": 0.8087, "num_input_tokens_seen": 22985936, "step": 39860 }, { "epoch": 5.93759308906762, "grad_norm": 0.23598100244998932, "learning_rate": 2.1197397086439495e-05, "loss": 0.7954, "num_input_tokens_seen": 22988528, "step": 39865 }, { "epoch": 5.938337801608579, "grad_norm": 0.18339502811431885, "learning_rate": 2.1190973989525377e-05, "loss": 0.8076, "num_input_tokens_seen": 22991344, "step": 39870 }, { "epoch": 5.939082514149538, "grad_norm": 0.3148353099822998, "learning_rate": 2.118455115000974e-05, "loss": 0.8206, "num_input_tokens_seen": 22994096, "step": 39875 }, { "epoch": 5.939827226690498, "grad_norm": 0.19067105650901794, "learning_rate": 2.117812856832663e-05, "loss": 0.7803, "num_input_tokens_seen": 22997040, "step": 39880 }, { "epoch": 5.940571939231456, "grad_norm": 0.22010241448879242, "learning_rate": 2.1171706244910055e-05, "loss": 0.8105, "num_input_tokens_seen": 22999824, "step": 39885 }, { "epoch": 5.941316651772416, "grad_norm": 0.19925160706043243, "learning_rate": 2.1165284180194003e-05, "loss": 0.7985, "num_input_tokens_seen": 23002896, "step": 39890 }, { "epoch": 5.942061364313375, "grad_norm": 0.20100386440753937, "learning_rate": 2.1158862374612465e-05, "loss": 0.7845, "num_input_tokens_seen": 23005616, "step": 39895 }, { "epoch": 5.9428060768543345, "grad_norm": 0.2698683440685272, "learning_rate": 2.1152440828599383e-05, "loss": 0.8162, "num_input_tokens_seen": 23008656, "step": 39900 }, { "epoch": 5.943550789395293, "grad_norm": 0.1552683562040329, "learning_rate": 2.114601954258871e-05, "loss": 0.8078, "num_input_tokens_seen": 23011952, "step": 39905 }, { "epoch": 5.944295501936253, "grad_norm": 0.2434520572423935, "learning_rate": 2.113959851701436e-05, "loss": 0.8073, "num_input_tokens_seen": 23015280, "step": 39910 }, { "epoch": 5.945040214477212, "grad_norm": 0.24393300712108612, "learning_rate": 2.1133177752310252e-05, "loss": 0.8089, "num_input_tokens_seen": 23018480, "step": 39915 }, { "epoch": 5.945784927018171, "grad_norm": 0.16030415892601013, "learning_rate": 2.112675724891027e-05, "loss": 0.8169, "num_input_tokens_seen": 23021168, "step": 39920 }, { "epoch": 5.94652963955913, "grad_norm": 0.21515853703022003, "learning_rate": 2.1120337007248284e-05, "loss": 0.8189, "num_input_tokens_seen": 23023952, "step": 39925 }, { "epoch": 5.94727435210009, "grad_norm": 0.18367183208465576, "learning_rate": 2.1113917027758145e-05, "loss": 0.8112, "num_input_tokens_seen": 23026800, "step": 39930 }, { "epoch": 5.9480190646410485, "grad_norm": 0.23015500605106354, "learning_rate": 2.1107497310873708e-05, "loss": 0.8046, "num_input_tokens_seen": 23029552, "step": 39935 }, { "epoch": 5.948763777182008, "grad_norm": 0.23737184703350067, "learning_rate": 2.1101077857028774e-05, "loss": 0.8232, "num_input_tokens_seen": 23032336, "step": 39940 }, { "epoch": 5.949508489722967, "grad_norm": 0.2006252110004425, "learning_rate": 2.1094658666657137e-05, "loss": 0.7951, "num_input_tokens_seen": 23035248, "step": 39945 }, { "epoch": 5.9502532022639265, "grad_norm": 0.18763212859630585, "learning_rate": 2.1088239740192588e-05, "loss": 0.7777, "num_input_tokens_seen": 23037904, "step": 39950 }, { "epoch": 5.950997914804885, "grad_norm": 0.15748310089111328, "learning_rate": 2.1081821078068902e-05, "loss": 0.7951, "num_input_tokens_seen": 23040688, "step": 39955 }, { "epoch": 5.951742627345844, "grad_norm": 0.1478578746318817, "learning_rate": 2.1075402680719814e-05, "loss": 0.8225, "num_input_tokens_seen": 23043632, "step": 39960 }, { "epoch": 5.952487339886804, "grad_norm": 0.18119925260543823, "learning_rate": 2.1068984548579053e-05, "loss": 0.7962, "num_input_tokens_seen": 23046160, "step": 39965 }, { "epoch": 5.953232052427763, "grad_norm": 0.1481836438179016, "learning_rate": 2.106256668208034e-05, "loss": 0.803, "num_input_tokens_seen": 23049008, "step": 39970 }, { "epoch": 5.953976764968722, "grad_norm": 0.19579853117465973, "learning_rate": 2.1056149081657368e-05, "loss": 0.8412, "num_input_tokens_seen": 23051792, "step": 39975 }, { "epoch": 5.954721477509681, "grad_norm": 0.18793457746505737, "learning_rate": 2.1049731747743793e-05, "loss": 0.7987, "num_input_tokens_seen": 23054768, "step": 39980 }, { "epoch": 5.9554661900506405, "grad_norm": 0.18895941972732544, "learning_rate": 2.104331468077329e-05, "loss": 0.7914, "num_input_tokens_seen": 23057680, "step": 39985 }, { "epoch": 5.9562109025916, "grad_norm": 0.2332078367471695, "learning_rate": 2.10368978811795e-05, "loss": 0.8054, "num_input_tokens_seen": 23060944, "step": 39990 }, { "epoch": 5.956955615132559, "grad_norm": 0.2017097920179367, "learning_rate": 2.1030481349396028e-05, "loss": 0.8063, "num_input_tokens_seen": 23063792, "step": 39995 }, { "epoch": 5.957700327673518, "grad_norm": 0.19092772901058197, "learning_rate": 2.1024065085856498e-05, "loss": 0.784, "num_input_tokens_seen": 23066896, "step": 40000 }, { "epoch": 5.958445040214477, "grad_norm": 0.23775066435337067, "learning_rate": 2.1017649090994477e-05, "loss": 0.8024, "num_input_tokens_seen": 23069712, "step": 40005 }, { "epoch": 5.959189752755436, "grad_norm": 0.28353992104530334, "learning_rate": 2.1011233365243538e-05, "loss": 0.8026, "num_input_tokens_seen": 23072624, "step": 40010 }, { "epoch": 5.959934465296396, "grad_norm": 0.22654612362384796, "learning_rate": 2.1004817909037245e-05, "loss": 0.8191, "num_input_tokens_seen": 23075504, "step": 40015 }, { "epoch": 5.9606791778373545, "grad_norm": 0.1762520968914032, "learning_rate": 2.0998402722809105e-05, "loss": 0.8064, "num_input_tokens_seen": 23078352, "step": 40020 }, { "epoch": 5.961423890378314, "grad_norm": 0.17013761401176453, "learning_rate": 2.0991987806992635e-05, "loss": 0.8038, "num_input_tokens_seen": 23081200, "step": 40025 }, { "epoch": 5.962168602919273, "grad_norm": 0.2427603304386139, "learning_rate": 2.0985573162021337e-05, "loss": 0.8129, "num_input_tokens_seen": 23083824, "step": 40030 }, { "epoch": 5.9629133154602325, "grad_norm": 0.23098516464233398, "learning_rate": 2.0979158788328684e-05, "loss": 0.7838, "num_input_tokens_seen": 23086928, "step": 40035 }, { "epoch": 5.963658028001191, "grad_norm": 0.1471862494945526, "learning_rate": 2.097274468634813e-05, "loss": 0.7929, "num_input_tokens_seen": 23089648, "step": 40040 }, { "epoch": 5.964402740542151, "grad_norm": 0.26379746198654175, "learning_rate": 2.0966330856513118e-05, "loss": 0.8049, "num_input_tokens_seen": 23092688, "step": 40045 }, { "epoch": 5.96514745308311, "grad_norm": 0.26859691739082336, "learning_rate": 2.095991729925707e-05, "loss": 0.8003, "num_input_tokens_seen": 23095824, "step": 40050 }, { "epoch": 5.965892165624069, "grad_norm": 0.3014923334121704, "learning_rate": 2.095350401501339e-05, "loss": 0.8332, "num_input_tokens_seen": 23098768, "step": 40055 }, { "epoch": 5.966636878165028, "grad_norm": 0.2155812531709671, "learning_rate": 2.094709100421545e-05, "loss": 0.7771, "num_input_tokens_seen": 23101488, "step": 40060 }, { "epoch": 5.967381590705988, "grad_norm": 0.2210961878299713, "learning_rate": 2.094067826729662e-05, "loss": 0.8032, "num_input_tokens_seen": 23104528, "step": 40065 }, { "epoch": 5.9681263032469465, "grad_norm": 0.2241598516702652, "learning_rate": 2.093426580469025e-05, "loss": 0.7946, "num_input_tokens_seen": 23107408, "step": 40070 }, { "epoch": 5.968871015787906, "grad_norm": 0.19506971538066864, "learning_rate": 2.0927853616829668e-05, "loss": 0.7953, "num_input_tokens_seen": 23109904, "step": 40075 }, { "epoch": 5.969615728328865, "grad_norm": 0.19510681927204132, "learning_rate": 2.0921441704148177e-05, "loss": 0.8054, "num_input_tokens_seen": 23112688, "step": 40080 }, { "epoch": 5.9703604408698245, "grad_norm": 0.1771862953901291, "learning_rate": 2.0915030067079084e-05, "loss": 0.7789, "num_input_tokens_seen": 23115568, "step": 40085 }, { "epoch": 5.971105153410783, "grad_norm": 0.1834549456834793, "learning_rate": 2.090861870605564e-05, "loss": 0.805, "num_input_tokens_seen": 23118256, "step": 40090 }, { "epoch": 5.971849865951743, "grad_norm": 0.20773397386074066, "learning_rate": 2.0902207621511123e-05, "loss": 0.811, "num_input_tokens_seen": 23121360, "step": 40095 }, { "epoch": 5.972594578492702, "grad_norm": 0.18880870938301086, "learning_rate": 2.0895796813878743e-05, "loss": 0.8109, "num_input_tokens_seen": 23124272, "step": 40100 }, { "epoch": 5.973339291033661, "grad_norm": 0.23395682871341705, "learning_rate": 2.0889386283591732e-05, "loss": 0.7899, "num_input_tokens_seen": 23127120, "step": 40105 }, { "epoch": 5.97408400357462, "grad_norm": 0.1715124398469925, "learning_rate": 2.088297603108328e-05, "loss": 0.8104, "num_input_tokens_seen": 23129968, "step": 40110 }, { "epoch": 5.97482871611558, "grad_norm": 0.17874448001384735, "learning_rate": 2.0876566056786572e-05, "loss": 0.8102, "num_input_tokens_seen": 23132784, "step": 40115 }, { "epoch": 5.9755734286565385, "grad_norm": 0.2583378851413727, "learning_rate": 2.087015636113477e-05, "loss": 0.8013, "num_input_tokens_seen": 23135792, "step": 40120 }, { "epoch": 5.976318141197497, "grad_norm": 0.22895729541778564, "learning_rate": 2.0863746944561e-05, "loss": 0.7799, "num_input_tokens_seen": 23138608, "step": 40125 }, { "epoch": 5.977062853738457, "grad_norm": 0.23610955476760864, "learning_rate": 2.0857337807498398e-05, "loss": 0.8166, "num_input_tokens_seen": 23141424, "step": 40130 }, { "epoch": 5.977807566279417, "grad_norm": 0.24716751277446747, "learning_rate": 2.085092895038007e-05, "loss": 0.8, "num_input_tokens_seen": 23144240, "step": 40135 }, { "epoch": 5.978552278820375, "grad_norm": 0.194234237074852, "learning_rate": 2.08445203736391e-05, "loss": 0.7885, "num_input_tokens_seen": 23147216, "step": 40140 }, { "epoch": 5.979296991361334, "grad_norm": 0.17617809772491455, "learning_rate": 2.0838112077708533e-05, "loss": 0.807, "num_input_tokens_seen": 23150160, "step": 40145 }, { "epoch": 5.980041703902294, "grad_norm": 0.3174739480018616, "learning_rate": 2.0831704063021433e-05, "loss": 0.8284, "num_input_tokens_seen": 23153104, "step": 40150 }, { "epoch": 5.980786416443253, "grad_norm": 0.21980884671211243, "learning_rate": 2.0825296330010834e-05, "loss": 0.7887, "num_input_tokens_seen": 23155984, "step": 40155 }, { "epoch": 5.981531128984212, "grad_norm": 0.2939242124557495, "learning_rate": 2.0818888879109728e-05, "loss": 0.8096, "num_input_tokens_seen": 23158896, "step": 40160 }, { "epoch": 5.982275841525171, "grad_norm": 0.2709137499332428, "learning_rate": 2.0812481710751115e-05, "loss": 0.8022, "num_input_tokens_seen": 23161904, "step": 40165 }, { "epoch": 5.9830205540661305, "grad_norm": 0.21454668045043945, "learning_rate": 2.0806074825367965e-05, "loss": 0.8277, "num_input_tokens_seen": 23164752, "step": 40170 }, { "epoch": 5.983765266607089, "grad_norm": 0.2237238585948944, "learning_rate": 2.079966822339322e-05, "loss": 0.8013, "num_input_tokens_seen": 23167600, "step": 40175 }, { "epoch": 5.984509979148049, "grad_norm": 0.21770931780338287, "learning_rate": 2.079326190525983e-05, "loss": 0.8251, "num_input_tokens_seen": 23170896, "step": 40180 }, { "epoch": 5.985254691689008, "grad_norm": 0.17557427287101746, "learning_rate": 2.0786855871400695e-05, "loss": 0.7816, "num_input_tokens_seen": 23173712, "step": 40185 }, { "epoch": 5.985999404229967, "grad_norm": 0.22301194071769714, "learning_rate": 2.0780450122248706e-05, "loss": 0.7955, "num_input_tokens_seen": 23176528, "step": 40190 }, { "epoch": 5.986744116770926, "grad_norm": 0.22266820073127747, "learning_rate": 2.0774044658236742e-05, "loss": 0.7802, "num_input_tokens_seen": 23179312, "step": 40195 }, { "epoch": 5.987488829311886, "grad_norm": 0.18224041163921356, "learning_rate": 2.0767639479797663e-05, "loss": 0.8007, "num_input_tokens_seen": 23182352, "step": 40200 }, { "epoch": 5.9882335418528445, "grad_norm": 0.2112981677055359, "learning_rate": 2.0761234587364294e-05, "loss": 0.7702, "num_input_tokens_seen": 23185424, "step": 40205 }, { "epoch": 5.988978254393804, "grad_norm": 0.2206491082906723, "learning_rate": 2.0754829981369458e-05, "loss": 0.8373, "num_input_tokens_seen": 23188368, "step": 40210 }, { "epoch": 5.989722966934763, "grad_norm": 0.2193613350391388, "learning_rate": 2.074842566224596e-05, "loss": 0.7781, "num_input_tokens_seen": 23191312, "step": 40215 }, { "epoch": 5.990467679475723, "grad_norm": 0.18110939860343933, "learning_rate": 2.074202163042657e-05, "loss": 0.7959, "num_input_tokens_seen": 23194448, "step": 40220 }, { "epoch": 5.991212392016681, "grad_norm": 0.2297353893518448, "learning_rate": 2.0735617886344043e-05, "loss": 0.8204, "num_input_tokens_seen": 23197104, "step": 40225 }, { "epoch": 5.991957104557641, "grad_norm": 0.2116047441959381, "learning_rate": 2.0729214430431118e-05, "loss": 0.8225, "num_input_tokens_seen": 23199792, "step": 40230 }, { "epoch": 5.9927018170986, "grad_norm": 0.30419352650642395, "learning_rate": 2.0722811263120523e-05, "loss": 0.8305, "num_input_tokens_seen": 23202736, "step": 40235 }, { "epoch": 5.993446529639559, "grad_norm": 0.23650820553302765, "learning_rate": 2.071640838484495e-05, "loss": 0.791, "num_input_tokens_seen": 23206032, "step": 40240 }, { "epoch": 5.994191242180518, "grad_norm": 0.2649073898792267, "learning_rate": 2.0710005796037078e-05, "loss": 0.8332, "num_input_tokens_seen": 23209168, "step": 40245 }, { "epoch": 5.994935954721478, "grad_norm": 0.23943468928337097, "learning_rate": 2.0703603497129584e-05, "loss": 0.7989, "num_input_tokens_seen": 23211952, "step": 40250 }, { "epoch": 5.9956806672624365, "grad_norm": 0.2678353488445282, "learning_rate": 2.0697201488555087e-05, "loss": 0.8023, "num_input_tokens_seen": 23214832, "step": 40255 }, { "epoch": 5.996425379803396, "grad_norm": 0.28269708156585693, "learning_rate": 2.0690799770746232e-05, "loss": 0.8052, "num_input_tokens_seen": 23217776, "step": 40260 }, { "epoch": 5.997170092344355, "grad_norm": 0.19044971466064453, "learning_rate": 2.06843983441356e-05, "loss": 0.8025, "num_input_tokens_seen": 23220560, "step": 40265 }, { "epoch": 5.997914804885315, "grad_norm": 0.15416282415390015, "learning_rate": 2.0677997209155785e-05, "loss": 0.8043, "num_input_tokens_seen": 23223056, "step": 40270 }, { "epoch": 5.998659517426273, "grad_norm": 0.22135327756404877, "learning_rate": 2.0671596366239343e-05, "loss": 0.8077, "num_input_tokens_seen": 23225872, "step": 40275 }, { "epoch": 5.999404229967233, "grad_norm": 0.23477447032928467, "learning_rate": 2.066519581581882e-05, "loss": 0.8133, "num_input_tokens_seen": 23228752, "step": 40280 }, { "epoch": 6.0, "eval_loss": 0.8047294020652771, "eval_runtime": 45.3395, "eval_samples_per_second": 65.815, "eval_steps_per_second": 16.454, "num_input_tokens_seen": 23230504, "step": 40284 }, { "epoch": 6.000148942508192, "grad_norm": 0.2531517744064331, "learning_rate": 2.0658795558326743e-05, "loss": 0.7977, "num_input_tokens_seen": 23231080, "step": 40285 }, { "epoch": 6.000893655049151, "grad_norm": 0.21045081317424774, "learning_rate": 2.065239559419561e-05, "loss": 0.8285, "num_input_tokens_seen": 23233672, "step": 40290 }, { "epoch": 6.00163836759011, "grad_norm": 0.22128233313560486, "learning_rate": 2.0645995923857902e-05, "loss": 0.7959, "num_input_tokens_seen": 23236488, "step": 40295 }, { "epoch": 6.00238308013107, "grad_norm": 0.25303056836128235, "learning_rate": 2.0639596547746104e-05, "loss": 0.832, "num_input_tokens_seen": 23239656, "step": 40300 }, { "epoch": 6.003127792672029, "grad_norm": 0.3112930655479431, "learning_rate": 2.0633197466292633e-05, "loss": 0.8227, "num_input_tokens_seen": 23242824, "step": 40305 }, { "epoch": 6.003872505212988, "grad_norm": 0.20743165910243988, "learning_rate": 2.062679867992992e-05, "loss": 0.7964, "num_input_tokens_seen": 23245704, "step": 40310 }, { "epoch": 6.004617217753947, "grad_norm": 0.2159266471862793, "learning_rate": 2.062040018909037e-05, "loss": 0.7853, "num_input_tokens_seen": 23248648, "step": 40315 }, { "epoch": 6.005361930294906, "grad_norm": 0.16715137660503387, "learning_rate": 2.0614001994206378e-05, "loss": 0.7996, "num_input_tokens_seen": 23251432, "step": 40320 }, { "epoch": 6.006106642835865, "grad_norm": 0.20796287059783936, "learning_rate": 2.060760409571029e-05, "loss": 0.8204, "num_input_tokens_seen": 23254312, "step": 40325 }, { "epoch": 6.006851355376824, "grad_norm": 0.4181634783744812, "learning_rate": 2.0601206494034465e-05, "loss": 0.8001, "num_input_tokens_seen": 23257352, "step": 40330 }, { "epoch": 6.007596067917784, "grad_norm": 0.1610421985387802, "learning_rate": 2.0594809189611218e-05, "loss": 0.7875, "num_input_tokens_seen": 23260264, "step": 40335 }, { "epoch": 6.0083407804587425, "grad_norm": 0.18138420581817627, "learning_rate": 2.058841218287287e-05, "loss": 0.7975, "num_input_tokens_seen": 23262920, "step": 40340 }, { "epoch": 6.009085492999702, "grad_norm": 0.20651671290397644, "learning_rate": 2.0582015474251672e-05, "loss": 0.798, "num_input_tokens_seen": 23265640, "step": 40345 }, { "epoch": 6.009830205540661, "grad_norm": 0.224612757563591, "learning_rate": 2.0575619064179912e-05, "loss": 0.822, "num_input_tokens_seen": 23268424, "step": 40350 }, { "epoch": 6.010574918081621, "grad_norm": 0.25990158319473267, "learning_rate": 2.0569222953089827e-05, "loss": 0.8074, "num_input_tokens_seen": 23271112, "step": 40355 }, { "epoch": 6.011319630622579, "grad_norm": 0.2322782427072525, "learning_rate": 2.0562827141413637e-05, "loss": 0.7936, "num_input_tokens_seen": 23274088, "step": 40360 }, { "epoch": 6.012064343163539, "grad_norm": 0.19005665183067322, "learning_rate": 2.0556431629583557e-05, "loss": 0.7995, "num_input_tokens_seen": 23277000, "step": 40365 }, { "epoch": 6.012809055704498, "grad_norm": 0.20081712305545807, "learning_rate": 2.0550036418031752e-05, "loss": 0.7844, "num_input_tokens_seen": 23279752, "step": 40370 }, { "epoch": 6.013553768245457, "grad_norm": 0.2350175529718399, "learning_rate": 2.0543641507190396e-05, "loss": 0.7847, "num_input_tokens_seen": 23282888, "step": 40375 }, { "epoch": 6.014298480786416, "grad_norm": 0.1855059564113617, "learning_rate": 2.0537246897491638e-05, "loss": 0.8063, "num_input_tokens_seen": 23285832, "step": 40380 }, { "epoch": 6.015043193327376, "grad_norm": 0.23541028797626495, "learning_rate": 2.0530852589367587e-05, "loss": 0.7807, "num_input_tokens_seen": 23288776, "step": 40385 }, { "epoch": 6.015787905868335, "grad_norm": 0.24849474430084229, "learning_rate": 2.052445858325034e-05, "loss": 0.7786, "num_input_tokens_seen": 23291816, "step": 40390 }, { "epoch": 6.016532618409294, "grad_norm": 0.18587027490139008, "learning_rate": 2.051806487957199e-05, "loss": 0.8174, "num_input_tokens_seen": 23294824, "step": 40395 }, { "epoch": 6.017277330950253, "grad_norm": 0.19593341648578644, "learning_rate": 2.0511671478764593e-05, "loss": 0.8292, "num_input_tokens_seen": 23297768, "step": 40400 }, { "epoch": 6.018022043491213, "grad_norm": 0.2786993384361267, "learning_rate": 2.0505278381260187e-05, "loss": 0.7849, "num_input_tokens_seen": 23300648, "step": 40405 }, { "epoch": 6.018766756032171, "grad_norm": 0.20128655433654785, "learning_rate": 2.0498885587490794e-05, "loss": 0.806, "num_input_tokens_seen": 23303720, "step": 40410 }, { "epoch": 6.019511468573131, "grad_norm": 0.17305190861225128, "learning_rate": 2.0492493097888414e-05, "loss": 0.8278, "num_input_tokens_seen": 23306536, "step": 40415 }, { "epoch": 6.02025618111409, "grad_norm": 0.1975318193435669, "learning_rate": 2.0486100912885036e-05, "loss": 0.8147, "num_input_tokens_seen": 23309384, "step": 40420 }, { "epoch": 6.021000893655049, "grad_norm": 0.18561339378356934, "learning_rate": 2.047970903291259e-05, "loss": 0.8068, "num_input_tokens_seen": 23312008, "step": 40425 }, { "epoch": 6.021745606196008, "grad_norm": 0.28140875697135925, "learning_rate": 2.0473317458403036e-05, "loss": 0.7976, "num_input_tokens_seen": 23314792, "step": 40430 }, { "epoch": 6.022490318736968, "grad_norm": 0.18768854439258575, "learning_rate": 2.0466926189788286e-05, "loss": 0.7858, "num_input_tokens_seen": 23317352, "step": 40435 }, { "epoch": 6.023235031277927, "grad_norm": 0.2596372365951538, "learning_rate": 2.0460535227500226e-05, "loss": 0.8153, "num_input_tokens_seen": 23320136, "step": 40440 }, { "epoch": 6.023979743818886, "grad_norm": 0.21921999752521515, "learning_rate": 2.045414457197074e-05, "loss": 0.8085, "num_input_tokens_seen": 23322856, "step": 40445 }, { "epoch": 6.024724456359845, "grad_norm": 0.23545725643634796, "learning_rate": 2.044775422363169e-05, "loss": 0.8009, "num_input_tokens_seen": 23325736, "step": 40450 }, { "epoch": 6.025469168900805, "grad_norm": 0.2695254981517792, "learning_rate": 2.0441364182914893e-05, "loss": 0.8204, "num_input_tokens_seen": 23328840, "step": 40455 }, { "epoch": 6.026213881441763, "grad_norm": 0.2618521749973297, "learning_rate": 2.0434974450252183e-05, "loss": 0.8134, "num_input_tokens_seen": 23331816, "step": 40460 }, { "epoch": 6.026958593982723, "grad_norm": 0.16047197580337524, "learning_rate": 2.042858502607533e-05, "loss": 0.7952, "num_input_tokens_seen": 23334504, "step": 40465 }, { "epoch": 6.027703306523682, "grad_norm": 0.1551608294248581, "learning_rate": 2.0422195910816116e-05, "loss": 0.7946, "num_input_tokens_seen": 23337192, "step": 40470 }, { "epoch": 6.0284480190646414, "grad_norm": 0.17928975820541382, "learning_rate": 2.041580710490629e-05, "loss": 0.8136, "num_input_tokens_seen": 23340104, "step": 40475 }, { "epoch": 6.0291927316056, "grad_norm": 0.2084992378950119, "learning_rate": 2.040941860877758e-05, "loss": 0.7805, "num_input_tokens_seen": 23342888, "step": 40480 }, { "epoch": 6.02993744414656, "grad_norm": 0.2157450169324875, "learning_rate": 2.04030304228617e-05, "loss": 0.8055, "num_input_tokens_seen": 23346024, "step": 40485 }, { "epoch": 6.030682156687519, "grad_norm": 0.225446417927742, "learning_rate": 2.039664254759033e-05, "loss": 0.8442, "num_input_tokens_seen": 23348744, "step": 40490 }, { "epoch": 6.031426869228477, "grad_norm": 0.21756762266159058, "learning_rate": 2.0390254983395146e-05, "loss": 0.7887, "num_input_tokens_seen": 23351720, "step": 40495 }, { "epoch": 6.032171581769437, "grad_norm": 0.32928451895713806, "learning_rate": 2.038386773070779e-05, "loss": 0.8012, "num_input_tokens_seen": 23354728, "step": 40500 }, { "epoch": 6.032916294310396, "grad_norm": 0.2953715920448303, "learning_rate": 2.0377480789959882e-05, "loss": 0.8087, "num_input_tokens_seen": 23357928, "step": 40505 }, { "epoch": 6.033661006851355, "grad_norm": 0.2054351270198822, "learning_rate": 2.0371094161583026e-05, "loss": 0.8199, "num_input_tokens_seen": 23360488, "step": 40510 }, { "epoch": 6.034405719392314, "grad_norm": 0.23268969357013702, "learning_rate": 2.036470784600881e-05, "loss": 0.8166, "num_input_tokens_seen": 23363240, "step": 40515 }, { "epoch": 6.035150431933274, "grad_norm": 0.22674529254436493, "learning_rate": 2.035832184366879e-05, "loss": 0.7911, "num_input_tokens_seen": 23366344, "step": 40520 }, { "epoch": 6.035895144474233, "grad_norm": 0.19531603157520294, "learning_rate": 2.0351936154994503e-05, "loss": 0.8128, "num_input_tokens_seen": 23369416, "step": 40525 }, { "epoch": 6.036639857015192, "grad_norm": 0.20485422015190125, "learning_rate": 2.034555078041748e-05, "loss": 0.8093, "num_input_tokens_seen": 23372168, "step": 40530 }, { "epoch": 6.037384569556151, "grad_norm": 0.23186294734477997, "learning_rate": 2.0339165720369207e-05, "loss": 0.7782, "num_input_tokens_seen": 23375336, "step": 40535 }, { "epoch": 6.038129282097111, "grad_norm": 0.22167326509952545, "learning_rate": 2.0332780975281177e-05, "loss": 0.8019, "num_input_tokens_seen": 23377864, "step": 40540 }, { "epoch": 6.038873994638069, "grad_norm": 0.25680670142173767, "learning_rate": 2.0326396545584822e-05, "loss": 0.8084, "num_input_tokens_seen": 23380840, "step": 40545 }, { "epoch": 6.039618707179029, "grad_norm": 0.20611202716827393, "learning_rate": 2.032001243171159e-05, "loss": 0.7892, "num_input_tokens_seen": 23383656, "step": 40550 }, { "epoch": 6.040363419719988, "grad_norm": 0.269845575094223, "learning_rate": 2.0313628634092887e-05, "loss": 0.8181, "num_input_tokens_seen": 23386792, "step": 40555 }, { "epoch": 6.0411081322609474, "grad_norm": 0.21191632747650146, "learning_rate": 2.030724515316011e-05, "loss": 0.7901, "num_input_tokens_seen": 23389480, "step": 40560 }, { "epoch": 6.041852844801906, "grad_norm": 0.23150323331356049, "learning_rate": 2.0300861989344627e-05, "loss": 0.7822, "num_input_tokens_seen": 23392424, "step": 40565 }, { "epoch": 6.042597557342866, "grad_norm": 0.19195324182510376, "learning_rate": 2.0294479143077783e-05, "loss": 0.8034, "num_input_tokens_seen": 23395336, "step": 40570 }, { "epoch": 6.043342269883825, "grad_norm": 0.20747675001621246, "learning_rate": 2.0288096614790905e-05, "loss": 0.841, "num_input_tokens_seen": 23398152, "step": 40575 }, { "epoch": 6.044086982424784, "grad_norm": 0.22774794697761536, "learning_rate": 2.0281714404915313e-05, "loss": 0.7858, "num_input_tokens_seen": 23401064, "step": 40580 }, { "epoch": 6.044831694965743, "grad_norm": 0.2035778909921646, "learning_rate": 2.027533251388227e-05, "loss": 0.7772, "num_input_tokens_seen": 23404008, "step": 40585 }, { "epoch": 6.045576407506703, "grad_norm": 0.2560085654258728, "learning_rate": 2.0268950942123046e-05, "loss": 0.7959, "num_input_tokens_seen": 23407144, "step": 40590 }, { "epoch": 6.046321120047661, "grad_norm": 0.1741660088300705, "learning_rate": 2.0262569690068882e-05, "loss": 0.7858, "num_input_tokens_seen": 23409832, "step": 40595 }, { "epoch": 6.047065832588621, "grad_norm": 0.23291946947574615, "learning_rate": 2.0256188758151e-05, "loss": 0.7805, "num_input_tokens_seen": 23412616, "step": 40600 }, { "epoch": 6.04781054512958, "grad_norm": 0.22446800768375397, "learning_rate": 2.024980814680059e-05, "loss": 0.8191, "num_input_tokens_seen": 23415272, "step": 40605 }, { "epoch": 6.0485552576705395, "grad_norm": 0.193806454539299, "learning_rate": 2.0243427856448834e-05, "loss": 0.8062, "num_input_tokens_seen": 23418280, "step": 40610 }, { "epoch": 6.049299970211498, "grad_norm": 0.34190496802330017, "learning_rate": 2.0237047887526887e-05, "loss": 0.8429, "num_input_tokens_seen": 23421192, "step": 40615 }, { "epoch": 6.050044682752458, "grad_norm": 0.2563111484050751, "learning_rate": 2.0230668240465886e-05, "loss": 0.8302, "num_input_tokens_seen": 23424072, "step": 40620 }, { "epoch": 6.050789395293417, "grad_norm": 0.26782625913619995, "learning_rate": 2.0224288915696924e-05, "loss": 0.8217, "num_input_tokens_seen": 23427080, "step": 40625 }, { "epoch": 6.051534107834376, "grad_norm": 0.22762715816497803, "learning_rate": 2.0217909913651102e-05, "loss": 0.7968, "num_input_tokens_seen": 23429928, "step": 40630 }, { "epoch": 6.052278820375335, "grad_norm": 0.1717921793460846, "learning_rate": 2.0211531234759487e-05, "loss": 0.7927, "num_input_tokens_seen": 23432456, "step": 40635 }, { "epoch": 6.053023532916295, "grad_norm": 0.23112165927886963, "learning_rate": 2.020515287945312e-05, "loss": 0.7944, "num_input_tokens_seen": 23435432, "step": 40640 }, { "epoch": 6.0537682454572534, "grad_norm": 0.2253200113773346, "learning_rate": 2.0198774848163027e-05, "loss": 0.801, "num_input_tokens_seen": 23438216, "step": 40645 }, { "epoch": 6.054512957998213, "grad_norm": 0.1755276620388031, "learning_rate": 2.0192397141320212e-05, "loss": 0.7966, "num_input_tokens_seen": 23441192, "step": 40650 }, { "epoch": 6.055257670539172, "grad_norm": 0.17313186824321747, "learning_rate": 2.018601975935565e-05, "loss": 0.8, "num_input_tokens_seen": 23444040, "step": 40655 }, { "epoch": 6.0560023830801315, "grad_norm": 0.24387022852897644, "learning_rate": 2.017964270270031e-05, "loss": 0.8096, "num_input_tokens_seen": 23446568, "step": 40660 }, { "epoch": 6.05674709562109, "grad_norm": 0.20047087967395782, "learning_rate": 2.0173265971785108e-05, "loss": 0.7864, "num_input_tokens_seen": 23449384, "step": 40665 }, { "epoch": 6.057491808162049, "grad_norm": 0.1975887417793274, "learning_rate": 2.0166889567040973e-05, "loss": 0.8103, "num_input_tokens_seen": 23452232, "step": 40670 }, { "epoch": 6.058236520703009, "grad_norm": 0.23092712461948395, "learning_rate": 2.0160513488898784e-05, "loss": 0.7849, "num_input_tokens_seen": 23455080, "step": 40675 }, { "epoch": 6.058981233243967, "grad_norm": 0.19036555290222168, "learning_rate": 2.0154137737789426e-05, "loss": 0.7873, "num_input_tokens_seen": 23457992, "step": 40680 }, { "epoch": 6.059725945784927, "grad_norm": 0.18594764173030853, "learning_rate": 2.0147762314143727e-05, "loss": 0.8219, "num_input_tokens_seen": 23460616, "step": 40685 }, { "epoch": 6.060470658325886, "grad_norm": 0.18477287888526917, "learning_rate": 2.0141387218392525e-05, "loss": 0.8079, "num_input_tokens_seen": 23463528, "step": 40690 }, { "epoch": 6.0612153708668455, "grad_norm": 0.27482348680496216, "learning_rate": 2.0135012450966632e-05, "loss": 0.8334, "num_input_tokens_seen": 23466664, "step": 40695 }, { "epoch": 6.061960083407804, "grad_norm": 0.2012270838022232, "learning_rate": 2.0128638012296817e-05, "loss": 0.811, "num_input_tokens_seen": 23469416, "step": 40700 }, { "epoch": 6.062704795948764, "grad_norm": 0.2250613421201706, "learning_rate": 2.0122263902813832e-05, "loss": 0.7893, "num_input_tokens_seen": 23472680, "step": 40705 }, { "epoch": 6.063449508489723, "grad_norm": 0.17608201503753662, "learning_rate": 2.011589012294842e-05, "loss": 0.8061, "num_input_tokens_seen": 23475688, "step": 40710 }, { "epoch": 6.064194221030682, "grad_norm": 0.20854414999485016, "learning_rate": 2.01095166731313e-05, "loss": 0.7843, "num_input_tokens_seen": 23478920, "step": 40715 }, { "epoch": 6.064938933571641, "grad_norm": 0.2860570549964905, "learning_rate": 2.0103143553793158e-05, "loss": 0.8162, "num_input_tokens_seen": 23481832, "step": 40720 }, { "epoch": 6.065683646112601, "grad_norm": 0.17516085505485535, "learning_rate": 2.0096770765364665e-05, "loss": 0.7898, "num_input_tokens_seen": 23484680, "step": 40725 }, { "epoch": 6.0664283586535594, "grad_norm": 0.15480831265449524, "learning_rate": 2.009039830827647e-05, "loss": 0.7992, "num_input_tokens_seen": 23487688, "step": 40730 }, { "epoch": 6.067173071194519, "grad_norm": 0.23003481328487396, "learning_rate": 2.0084026182959195e-05, "loss": 0.8109, "num_input_tokens_seen": 23490472, "step": 40735 }, { "epoch": 6.067917783735478, "grad_norm": 0.1739007979631424, "learning_rate": 2.0077654389843455e-05, "loss": 0.7763, "num_input_tokens_seen": 23493256, "step": 40740 }, { "epoch": 6.0686624962764375, "grad_norm": 0.16889218986034393, "learning_rate": 2.0071282929359802e-05, "loss": 0.8255, "num_input_tokens_seen": 23496072, "step": 40745 }, { "epoch": 6.069407208817396, "grad_norm": 0.21266168355941772, "learning_rate": 2.0064911801938822e-05, "loss": 0.8004, "num_input_tokens_seen": 23499048, "step": 40750 }, { "epoch": 6.070151921358356, "grad_norm": 0.20328232645988464, "learning_rate": 2.0058541008011028e-05, "loss": 0.8053, "num_input_tokens_seen": 23501736, "step": 40755 }, { "epoch": 6.070896633899315, "grad_norm": 0.1954110711812973, "learning_rate": 2.0052170548006944e-05, "loss": 0.8255, "num_input_tokens_seen": 23504584, "step": 40760 }, { "epoch": 6.071641346440274, "grad_norm": 0.16111186146736145, "learning_rate": 2.0045800422357066e-05, "loss": 0.7771, "num_input_tokens_seen": 23507272, "step": 40765 }, { "epoch": 6.072386058981233, "grad_norm": 0.38531890511512756, "learning_rate": 2.003943063149184e-05, "loss": 0.7994, "num_input_tokens_seen": 23510408, "step": 40770 }, { "epoch": 6.073130771522193, "grad_norm": 0.2454238086938858, "learning_rate": 2.003306117584173e-05, "loss": 0.8171, "num_input_tokens_seen": 23513192, "step": 40775 }, { "epoch": 6.0738754840631515, "grad_norm": 0.19400984048843384, "learning_rate": 2.0026692055837155e-05, "loss": 0.776, "num_input_tokens_seen": 23516200, "step": 40780 }, { "epoch": 6.074620196604111, "grad_norm": 0.2136971354484558, "learning_rate": 2.0020323271908518e-05, "loss": 0.8044, "num_input_tokens_seen": 23519080, "step": 40785 }, { "epoch": 6.07536490914507, "grad_norm": 0.27606314420700073, "learning_rate": 2.0013954824486176e-05, "loss": 0.8, "num_input_tokens_seen": 23521768, "step": 40790 }, { "epoch": 6.0761096216860295, "grad_norm": 0.20780141651630402, "learning_rate": 2.0007586714000497e-05, "loss": 0.8253, "num_input_tokens_seen": 23524712, "step": 40795 }, { "epoch": 6.076854334226988, "grad_norm": 0.2894483208656311, "learning_rate": 2.000121894088181e-05, "loss": 0.7777, "num_input_tokens_seen": 23527816, "step": 40800 }, { "epoch": 6.077599046767948, "grad_norm": 0.19797545671463013, "learning_rate": 1.9994851505560424e-05, "loss": 0.7932, "num_input_tokens_seen": 23530760, "step": 40805 }, { "epoch": 6.078343759308907, "grad_norm": 0.21357689797878265, "learning_rate": 1.9988484408466622e-05, "loss": 0.7972, "num_input_tokens_seen": 23533576, "step": 40810 }, { "epoch": 6.079088471849866, "grad_norm": 0.31137731671333313, "learning_rate": 1.9982117650030674e-05, "loss": 0.835, "num_input_tokens_seen": 23536712, "step": 40815 }, { "epoch": 6.079833184390825, "grad_norm": 0.2685387432575226, "learning_rate": 1.9975751230682808e-05, "loss": 0.8266, "num_input_tokens_seen": 23539592, "step": 40820 }, { "epoch": 6.080577896931785, "grad_norm": 0.27621015906333923, "learning_rate": 1.996938515085326e-05, "loss": 0.8075, "num_input_tokens_seen": 23542536, "step": 40825 }, { "epoch": 6.0813226094727435, "grad_norm": 0.2254568636417389, "learning_rate": 1.9963019410972194e-05, "loss": 0.8152, "num_input_tokens_seen": 23545320, "step": 40830 }, { "epoch": 6.082067322013703, "grad_norm": 0.1609010547399521, "learning_rate": 1.9956654011469808e-05, "loss": 0.7999, "num_input_tokens_seen": 23548008, "step": 40835 }, { "epoch": 6.082812034554662, "grad_norm": 0.1830732524394989, "learning_rate": 1.995028895277623e-05, "loss": 0.789, "num_input_tokens_seen": 23550728, "step": 40840 }, { "epoch": 6.083556747095621, "grad_norm": 0.1619318276643753, "learning_rate": 1.9943924235321605e-05, "loss": 0.8057, "num_input_tokens_seen": 23553768, "step": 40845 }, { "epoch": 6.08430145963658, "grad_norm": 0.18267419934272766, "learning_rate": 1.9937559859536016e-05, "loss": 0.8104, "num_input_tokens_seen": 23556488, "step": 40850 }, { "epoch": 6.085046172177539, "grad_norm": 0.18482060730457306, "learning_rate": 1.9931195825849544e-05, "loss": 0.7962, "num_input_tokens_seen": 23559048, "step": 40855 }, { "epoch": 6.085790884718499, "grad_norm": 0.13180284202098846, "learning_rate": 1.9924832134692262e-05, "loss": 0.803, "num_input_tokens_seen": 23561768, "step": 40860 }, { "epoch": 6.0865355972594575, "grad_norm": 0.166965052485466, "learning_rate": 1.991846878649419e-05, "loss": 0.8091, "num_input_tokens_seen": 23564680, "step": 40865 }, { "epoch": 6.087280309800417, "grad_norm": 0.2401343733072281, "learning_rate": 1.991210578168533e-05, "loss": 0.806, "num_input_tokens_seen": 23567656, "step": 40870 }, { "epoch": 6.088025022341376, "grad_norm": 0.22529293596744537, "learning_rate": 1.9905743120695675e-05, "loss": 0.8083, "num_input_tokens_seen": 23570888, "step": 40875 }, { "epoch": 6.0887697348823355, "grad_norm": 0.20362506806850433, "learning_rate": 1.9899380803955193e-05, "loss": 0.7914, "num_input_tokens_seen": 23573736, "step": 40880 }, { "epoch": 6.089514447423294, "grad_norm": 0.19547487795352936, "learning_rate": 1.9893018831893816e-05, "loss": 0.785, "num_input_tokens_seen": 23576392, "step": 40885 }, { "epoch": 6.090259159964254, "grad_norm": 0.17487040162086487, "learning_rate": 1.9886657204941458e-05, "loss": 0.7717, "num_input_tokens_seen": 23579176, "step": 40890 }, { "epoch": 6.091003872505213, "grad_norm": 0.31258195638656616, "learning_rate": 1.9880295923528025e-05, "loss": 0.823, "num_input_tokens_seen": 23582024, "step": 40895 }, { "epoch": 6.091748585046172, "grad_norm": 0.19259895384311676, "learning_rate": 1.9873934988083373e-05, "loss": 0.8112, "num_input_tokens_seen": 23584968, "step": 40900 }, { "epoch": 6.092493297587131, "grad_norm": 0.247184157371521, "learning_rate": 1.9867574399037365e-05, "loss": 0.7925, "num_input_tokens_seen": 23587976, "step": 40905 }, { "epoch": 6.093238010128091, "grad_norm": 0.1602070927619934, "learning_rate": 1.98612141568198e-05, "loss": 0.8148, "num_input_tokens_seen": 23590952, "step": 40910 }, { "epoch": 6.0939827226690495, "grad_norm": 0.19280098378658295, "learning_rate": 1.9854854261860496e-05, "loss": 0.7767, "num_input_tokens_seen": 23593736, "step": 40915 }, { "epoch": 6.094727435210009, "grad_norm": 0.3634085953235626, "learning_rate": 1.9848494714589214e-05, "loss": 0.8125, "num_input_tokens_seen": 23596584, "step": 40920 }, { "epoch": 6.095472147750968, "grad_norm": 0.18719425797462463, "learning_rate": 1.9842135515435717e-05, "loss": 0.8114, "num_input_tokens_seen": 23599464, "step": 40925 }, { "epoch": 6.0962168602919276, "grad_norm": 0.2795962393283844, "learning_rate": 1.9835776664829735e-05, "loss": 0.7918, "num_input_tokens_seen": 23602408, "step": 40930 }, { "epoch": 6.096961572832886, "grad_norm": 0.2213345617055893, "learning_rate": 1.9829418163200968e-05, "loss": 0.7744, "num_input_tokens_seen": 23605224, "step": 40935 }, { "epoch": 6.097706285373846, "grad_norm": 0.16355681419372559, "learning_rate": 1.9823060010979096e-05, "loss": 0.8092, "num_input_tokens_seen": 23608040, "step": 40940 }, { "epoch": 6.098450997914805, "grad_norm": 0.1564159393310547, "learning_rate": 1.9816702208593795e-05, "loss": 0.8091, "num_input_tokens_seen": 23611048, "step": 40945 }, { "epoch": 6.099195710455764, "grad_norm": 0.214519664645195, "learning_rate": 1.9810344756474676e-05, "loss": 0.8064, "num_input_tokens_seen": 23613736, "step": 40950 }, { "epoch": 6.099940422996723, "grad_norm": 0.24754899740219116, "learning_rate": 1.9803987655051354e-05, "loss": 0.8007, "num_input_tokens_seen": 23616648, "step": 40955 }, { "epoch": 6.100685135537683, "grad_norm": 0.24326607584953308, "learning_rate": 1.979763090475342e-05, "loss": 0.7958, "num_input_tokens_seen": 23619432, "step": 40960 }, { "epoch": 6.1014298480786415, "grad_norm": 0.2571834623813629, "learning_rate": 1.979127450601045e-05, "loss": 0.83, "num_input_tokens_seen": 23622280, "step": 40965 }, { "epoch": 6.102174560619601, "grad_norm": 0.22268328070640564, "learning_rate": 1.978491845925196e-05, "loss": 0.7968, "num_input_tokens_seen": 23625384, "step": 40970 }, { "epoch": 6.10291927316056, "grad_norm": 0.20132224261760712, "learning_rate": 1.9778562764907475e-05, "loss": 0.7973, "num_input_tokens_seen": 23628488, "step": 40975 }, { "epoch": 6.10366398570152, "grad_norm": 0.352387934923172, "learning_rate": 1.97722074234065e-05, "loss": 0.7613, "num_input_tokens_seen": 23631560, "step": 40980 }, { "epoch": 6.104408698242478, "grad_norm": 0.22568757832050323, "learning_rate": 1.97658524351785e-05, "loss": 0.7829, "num_input_tokens_seen": 23634568, "step": 40985 }, { "epoch": 6.105153410783438, "grad_norm": 0.26255324482917786, "learning_rate": 1.9759497800652897e-05, "loss": 0.7692, "num_input_tokens_seen": 23637416, "step": 40990 }, { "epoch": 6.105898123324397, "grad_norm": 0.23577812314033508, "learning_rate": 1.975314352025913e-05, "loss": 0.8085, "num_input_tokens_seen": 23640488, "step": 40995 }, { "epoch": 6.106642835865356, "grad_norm": 0.209344282746315, "learning_rate": 1.9746789594426593e-05, "loss": 0.8186, "num_input_tokens_seen": 23643848, "step": 41000 }, { "epoch": 6.107387548406315, "grad_norm": 0.2496105134487152, "learning_rate": 1.9740436023584653e-05, "loss": 0.817, "num_input_tokens_seen": 23646440, "step": 41005 }, { "epoch": 6.108132260947274, "grad_norm": 0.21650998294353485, "learning_rate": 1.973408280816267e-05, "loss": 0.7768, "num_input_tokens_seen": 23649224, "step": 41010 }, { "epoch": 6.1088769734882336, "grad_norm": 0.23497414588928223, "learning_rate": 1.9727729948589955e-05, "loss": 0.7888, "num_input_tokens_seen": 23652072, "step": 41015 }, { "epoch": 6.109621686029192, "grad_norm": 0.1659335494041443, "learning_rate": 1.9721377445295813e-05, "loss": 0.8186, "num_input_tokens_seen": 23654760, "step": 41020 }, { "epoch": 6.110366398570152, "grad_norm": 0.20362703502178192, "learning_rate": 1.9715025298709532e-05, "loss": 0.7856, "num_input_tokens_seen": 23657672, "step": 41025 }, { "epoch": 6.111111111111111, "grad_norm": 0.2239220291376114, "learning_rate": 1.970867350926035e-05, "loss": 0.8208, "num_input_tokens_seen": 23660456, "step": 41030 }, { "epoch": 6.11185582365207, "grad_norm": 0.20181262493133545, "learning_rate": 1.9702322077377493e-05, "loss": 0.8031, "num_input_tokens_seen": 23662888, "step": 41035 }, { "epoch": 6.112600536193029, "grad_norm": 0.2450651377439499, "learning_rate": 1.9695971003490175e-05, "loss": 0.7834, "num_input_tokens_seen": 23665800, "step": 41040 }, { "epoch": 6.113345248733989, "grad_norm": 0.2159179151058197, "learning_rate": 1.9689620288027574e-05, "loss": 0.7861, "num_input_tokens_seen": 23668616, "step": 41045 }, { "epoch": 6.1140899612749475, "grad_norm": 0.20410802960395813, "learning_rate": 1.9683269931418842e-05, "loss": 0.7899, "num_input_tokens_seen": 23671496, "step": 41050 }, { "epoch": 6.114834673815907, "grad_norm": 0.40798160433769226, "learning_rate": 1.9676919934093108e-05, "loss": 0.8038, "num_input_tokens_seen": 23674408, "step": 41055 }, { "epoch": 6.115579386356866, "grad_norm": 0.26339253783226013, "learning_rate": 1.9670570296479488e-05, "loss": 0.7978, "num_input_tokens_seen": 23677384, "step": 41060 }, { "epoch": 6.116324098897826, "grad_norm": 0.19808560609817505, "learning_rate": 1.9664221019007065e-05, "loss": 0.7908, "num_input_tokens_seen": 23680392, "step": 41065 }, { "epoch": 6.117068811438784, "grad_norm": 0.19016438722610474, "learning_rate": 1.9657872102104882e-05, "loss": 0.7993, "num_input_tokens_seen": 23682888, "step": 41070 }, { "epoch": 6.117813523979744, "grad_norm": 0.238264799118042, "learning_rate": 1.9651523546201982e-05, "loss": 0.8272, "num_input_tokens_seen": 23685736, "step": 41075 }, { "epoch": 6.118558236520703, "grad_norm": 0.19165582954883575, "learning_rate": 1.9645175351727383e-05, "loss": 0.8048, "num_input_tokens_seen": 23688872, "step": 41080 }, { "epoch": 6.119302949061662, "grad_norm": 0.2702161967754364, "learning_rate": 1.9638827519110057e-05, "loss": 0.8105, "num_input_tokens_seen": 23691912, "step": 41085 }, { "epoch": 6.120047661602621, "grad_norm": 0.18893073499202728, "learning_rate": 1.9632480048778968e-05, "loss": 0.7841, "num_input_tokens_seen": 23694600, "step": 41090 }, { "epoch": 6.120792374143581, "grad_norm": 0.2835462987422943, "learning_rate": 1.962613294116306e-05, "loss": 0.8031, "num_input_tokens_seen": 23697768, "step": 41095 }, { "epoch": 6.1215370866845396, "grad_norm": 0.21390441060066223, "learning_rate": 1.9619786196691238e-05, "loss": 0.7895, "num_input_tokens_seen": 23700424, "step": 41100 }, { "epoch": 6.122281799225499, "grad_norm": 0.20857566595077515, "learning_rate": 1.9613439815792394e-05, "loss": 0.7822, "num_input_tokens_seen": 23703528, "step": 41105 }, { "epoch": 6.123026511766458, "grad_norm": 0.32954418659210205, "learning_rate": 1.9607093798895382e-05, "loss": 0.7712, "num_input_tokens_seen": 23706440, "step": 41110 }, { "epoch": 6.123771224307418, "grad_norm": 0.21320085227489471, "learning_rate": 1.960074814642905e-05, "loss": 0.8327, "num_input_tokens_seen": 23709224, "step": 41115 }, { "epoch": 6.124515936848376, "grad_norm": 0.18924136459827423, "learning_rate": 1.95944028588222e-05, "loss": 0.7999, "num_input_tokens_seen": 23711848, "step": 41120 }, { "epoch": 6.125260649389336, "grad_norm": 0.16851471364498138, "learning_rate": 1.9588057936503627e-05, "loss": 0.8027, "num_input_tokens_seen": 23714888, "step": 41125 }, { "epoch": 6.126005361930295, "grad_norm": 0.262346476316452, "learning_rate": 1.95817133799021e-05, "loss": 0.8218, "num_input_tokens_seen": 23717576, "step": 41130 }, { "epoch": 6.126750074471254, "grad_norm": 0.3165964186191559, "learning_rate": 1.957536918944635e-05, "loss": 0.8024, "num_input_tokens_seen": 23720680, "step": 41135 }, { "epoch": 6.127494787012213, "grad_norm": 0.2815776467323303, "learning_rate": 1.9569025365565095e-05, "loss": 0.8049, "num_input_tokens_seen": 23723624, "step": 41140 }, { "epoch": 6.128239499553173, "grad_norm": 0.2557947039604187, "learning_rate": 1.9562681908687035e-05, "loss": 0.7891, "num_input_tokens_seen": 23726408, "step": 41145 }, { "epoch": 6.128984212094132, "grad_norm": 0.21667717397212982, "learning_rate": 1.955633881924082e-05, "loss": 0.8218, "num_input_tokens_seen": 23729064, "step": 41150 }, { "epoch": 6.129728924635091, "grad_norm": 0.26261159777641296, "learning_rate": 1.954999609765509e-05, "loss": 0.79, "num_input_tokens_seen": 23732008, "step": 41155 }, { "epoch": 6.13047363717605, "grad_norm": 0.20431216061115265, "learning_rate": 1.9543653744358465e-05, "loss": 0.8045, "num_input_tokens_seen": 23734920, "step": 41160 }, { "epoch": 6.13121834971701, "grad_norm": 0.15630467236042023, "learning_rate": 1.953731175977954e-05, "loss": 0.7837, "num_input_tokens_seen": 23737896, "step": 41165 }, { "epoch": 6.131963062257968, "grad_norm": 0.24026089906692505, "learning_rate": 1.9530970144346874e-05, "loss": 0.793, "num_input_tokens_seen": 23740712, "step": 41170 }, { "epoch": 6.132707774798928, "grad_norm": 0.2124781310558319, "learning_rate": 1.9524628898489016e-05, "loss": 0.8126, "num_input_tokens_seen": 23744200, "step": 41175 }, { "epoch": 6.133452487339887, "grad_norm": 0.2525143325328827, "learning_rate": 1.9518288022634468e-05, "loss": 0.7915, "num_input_tokens_seen": 23746984, "step": 41180 }, { "epoch": 6.134197199880846, "grad_norm": 0.23713922500610352, "learning_rate": 1.9511947517211742e-05, "loss": 0.794, "num_input_tokens_seen": 23750024, "step": 41185 }, { "epoch": 6.134941912421805, "grad_norm": 0.21654705703258514, "learning_rate": 1.9505607382649276e-05, "loss": 0.7775, "num_input_tokens_seen": 23753160, "step": 41190 }, { "epoch": 6.135686624962764, "grad_norm": 0.23623314499855042, "learning_rate": 1.9499267619375534e-05, "loss": 0.804, "num_input_tokens_seen": 23755976, "step": 41195 }, { "epoch": 6.136431337503724, "grad_norm": 0.24589642882347107, "learning_rate": 1.9492928227818914e-05, "loss": 0.8189, "num_input_tokens_seen": 23758888, "step": 41200 }, { "epoch": 6.137176050044682, "grad_norm": 0.24062949419021606, "learning_rate": 1.9486589208407812e-05, "loss": 0.784, "num_input_tokens_seen": 23761544, "step": 41205 }, { "epoch": 6.137920762585642, "grad_norm": 0.248845174908638, "learning_rate": 1.9480250561570603e-05, "loss": 0.7743, "num_input_tokens_seen": 23764488, "step": 41210 }, { "epoch": 6.138665475126601, "grad_norm": 0.27818265557289124, "learning_rate": 1.9473912287735614e-05, "loss": 0.7618, "num_input_tokens_seen": 23767304, "step": 41215 }, { "epoch": 6.13941018766756, "grad_norm": 0.25985419750213623, "learning_rate": 1.9467574387331167e-05, "loss": 0.7693, "num_input_tokens_seen": 23769800, "step": 41220 }, { "epoch": 6.140154900208519, "grad_norm": 0.2545009255409241, "learning_rate": 1.9461236860785558e-05, "loss": 0.7979, "num_input_tokens_seen": 23772680, "step": 41225 }, { "epoch": 6.140899612749479, "grad_norm": 0.20804497599601746, "learning_rate": 1.9454899708527038e-05, "loss": 0.8208, "num_input_tokens_seen": 23775400, "step": 41230 }, { "epoch": 6.141644325290438, "grad_norm": 0.19897578656673431, "learning_rate": 1.9448562930983848e-05, "loss": 0.8037, "num_input_tokens_seen": 23778216, "step": 41235 }, { "epoch": 6.142389037831397, "grad_norm": 0.24445970356464386, "learning_rate": 1.94422265285842e-05, "loss": 0.7834, "num_input_tokens_seen": 23781256, "step": 41240 }, { "epoch": 6.143133750372356, "grad_norm": 0.20945464074611664, "learning_rate": 1.9435890501756294e-05, "loss": 0.8279, "num_input_tokens_seen": 23783848, "step": 41245 }, { "epoch": 6.143878462913316, "grad_norm": 0.18105129897594452, "learning_rate": 1.9429554850928284e-05, "loss": 0.7771, "num_input_tokens_seen": 23786792, "step": 41250 }, { "epoch": 6.144623175454274, "grad_norm": 0.20076590776443481, "learning_rate": 1.9423219576528306e-05, "loss": 0.8189, "num_input_tokens_seen": 23789832, "step": 41255 }, { "epoch": 6.145367887995234, "grad_norm": 0.22335940599441528, "learning_rate": 1.941688467898448e-05, "loss": 0.8035, "num_input_tokens_seen": 23792680, "step": 41260 }, { "epoch": 6.146112600536193, "grad_norm": 0.2240612953901291, "learning_rate": 1.9410550158724898e-05, "loss": 0.7878, "num_input_tokens_seen": 23795624, "step": 41265 }, { "epoch": 6.146857313077152, "grad_norm": 0.2712332010269165, "learning_rate": 1.9404216016177594e-05, "loss": 0.8442, "num_input_tokens_seen": 23798472, "step": 41270 }, { "epoch": 6.147602025618111, "grad_norm": 0.26315897703170776, "learning_rate": 1.9397882251770627e-05, "loss": 0.8208, "num_input_tokens_seen": 23801864, "step": 41275 }, { "epoch": 6.148346738159071, "grad_norm": 0.25844064354896545, "learning_rate": 1.9391548865932e-05, "loss": 0.7785, "num_input_tokens_seen": 23804616, "step": 41280 }, { "epoch": 6.14909145070003, "grad_norm": 0.21241897344589233, "learning_rate": 1.9385215859089702e-05, "loss": 0.8, "num_input_tokens_seen": 23807304, "step": 41285 }, { "epoch": 6.149836163240989, "grad_norm": 0.2665224075317383, "learning_rate": 1.937888323167168e-05, "loss": 0.7896, "num_input_tokens_seen": 23810216, "step": 41290 }, { "epoch": 6.150580875781948, "grad_norm": 0.22563053667545319, "learning_rate": 1.9372550984105885e-05, "loss": 0.8156, "num_input_tokens_seen": 23813032, "step": 41295 }, { "epoch": 6.151325588322908, "grad_norm": 0.2746976912021637, "learning_rate": 1.9366219116820205e-05, "loss": 0.8199, "num_input_tokens_seen": 23815752, "step": 41300 }, { "epoch": 6.152070300863866, "grad_norm": 0.203956738114357, "learning_rate": 1.9359887630242547e-05, "loss": 0.8071, "num_input_tokens_seen": 23818664, "step": 41305 }, { "epoch": 6.152815013404826, "grad_norm": 0.24019433557987213, "learning_rate": 1.9353556524800743e-05, "loss": 0.8172, "num_input_tokens_seen": 23821576, "step": 41310 }, { "epoch": 6.153559725945785, "grad_norm": 0.17277759313583374, "learning_rate": 1.934722580092263e-05, "loss": 0.7762, "num_input_tokens_seen": 23824584, "step": 41315 }, { "epoch": 6.1543044384867445, "grad_norm": 0.213987797498703, "learning_rate": 1.9340895459036014e-05, "loss": 0.8143, "num_input_tokens_seen": 23827336, "step": 41320 }, { "epoch": 6.155049151027703, "grad_norm": 0.1704380214214325, "learning_rate": 1.9334565499568676e-05, "loss": 0.7994, "num_input_tokens_seen": 23830056, "step": 41325 }, { "epoch": 6.155793863568663, "grad_norm": 0.2104542851448059, "learning_rate": 1.9328235922948375e-05, "loss": 0.777, "num_input_tokens_seen": 23832712, "step": 41330 }, { "epoch": 6.156538576109622, "grad_norm": 0.20845866203308105, "learning_rate": 1.932190672960282e-05, "loss": 0.7748, "num_input_tokens_seen": 23835720, "step": 41335 }, { "epoch": 6.157283288650581, "grad_norm": 0.1696677803993225, "learning_rate": 1.931557791995973e-05, "loss": 0.8281, "num_input_tokens_seen": 23838472, "step": 41340 }, { "epoch": 6.15802800119154, "grad_norm": 0.2628965973854065, "learning_rate": 1.930924949444677e-05, "loss": 0.7869, "num_input_tokens_seen": 23841192, "step": 41345 }, { "epoch": 6.1587727137325, "grad_norm": 0.2430456131696701, "learning_rate": 1.9302921453491596e-05, "loss": 0.8137, "num_input_tokens_seen": 23844072, "step": 41350 }, { "epoch": 6.159517426273458, "grad_norm": 0.18073026835918427, "learning_rate": 1.9296593797521823e-05, "loss": 0.8047, "num_input_tokens_seen": 23846952, "step": 41355 }, { "epoch": 6.160262138814417, "grad_norm": 0.27277249097824097, "learning_rate": 1.9290266526965056e-05, "loss": 0.7987, "num_input_tokens_seen": 23849928, "step": 41360 }, { "epoch": 6.161006851355377, "grad_norm": 0.3438785672187805, "learning_rate": 1.9283939642248858e-05, "loss": 0.8144, "num_input_tokens_seen": 23852776, "step": 41365 }, { "epoch": 6.161751563896336, "grad_norm": 0.23293563723564148, "learning_rate": 1.927761314380078e-05, "loss": 0.8057, "num_input_tokens_seen": 23856008, "step": 41370 }, { "epoch": 6.162496276437295, "grad_norm": 0.19682049751281738, "learning_rate": 1.9271287032048343e-05, "loss": 0.7774, "num_input_tokens_seen": 23858632, "step": 41375 }, { "epoch": 6.163240988978254, "grad_norm": 0.25990915298461914, "learning_rate": 1.9264961307419037e-05, "loss": 0.784, "num_input_tokens_seen": 23861512, "step": 41380 }, { "epoch": 6.163985701519214, "grad_norm": 0.23181116580963135, "learning_rate": 1.9258635970340326e-05, "loss": 0.7993, "num_input_tokens_seen": 23864328, "step": 41385 }, { "epoch": 6.164730414060172, "grad_norm": 0.15563207864761353, "learning_rate": 1.925231102123966e-05, "loss": 0.7974, "num_input_tokens_seen": 23867368, "step": 41390 }, { "epoch": 6.165475126601132, "grad_norm": 0.2835780680179596, "learning_rate": 1.9245986460544448e-05, "loss": 0.8284, "num_input_tokens_seen": 23870248, "step": 41395 }, { "epoch": 6.166219839142091, "grad_norm": 0.15440183877944946, "learning_rate": 1.9239662288682067e-05, "loss": 0.8029, "num_input_tokens_seen": 23872872, "step": 41400 }, { "epoch": 6.1669645516830505, "grad_norm": 0.17377081513404846, "learning_rate": 1.923333850607989e-05, "loss": 0.8014, "num_input_tokens_seen": 23875784, "step": 41405 }, { "epoch": 6.167709264224009, "grad_norm": 0.2692713141441345, "learning_rate": 1.922701511316526e-05, "loss": 0.821, "num_input_tokens_seen": 23878632, "step": 41410 }, { "epoch": 6.168453976764969, "grad_norm": 0.23296459019184113, "learning_rate": 1.9220692110365468e-05, "loss": 0.8144, "num_input_tokens_seen": 23881832, "step": 41415 }, { "epoch": 6.169198689305928, "grad_norm": 0.21391993761062622, "learning_rate": 1.9214369498107806e-05, "loss": 0.801, "num_input_tokens_seen": 23885224, "step": 41420 }, { "epoch": 6.169943401846887, "grad_norm": 0.21792520582675934, "learning_rate": 1.9208047276819537e-05, "loss": 0.8183, "num_input_tokens_seen": 23888136, "step": 41425 }, { "epoch": 6.170688114387846, "grad_norm": 0.21863387525081635, "learning_rate": 1.920172544692789e-05, "loss": 0.8061, "num_input_tokens_seen": 23891112, "step": 41430 }, { "epoch": 6.171432826928806, "grad_norm": 0.18333399295806885, "learning_rate": 1.9195404008860053e-05, "loss": 0.823, "num_input_tokens_seen": 23893896, "step": 41435 }, { "epoch": 6.172177539469764, "grad_norm": 0.139284148812294, "learning_rate": 1.9189082963043213e-05, "loss": 0.7939, "num_input_tokens_seen": 23896648, "step": 41440 }, { "epoch": 6.172922252010724, "grad_norm": 0.20746347308158875, "learning_rate": 1.918276230990453e-05, "loss": 0.7785, "num_input_tokens_seen": 23899624, "step": 41445 }, { "epoch": 6.173666964551683, "grad_norm": 0.1983267068862915, "learning_rate": 1.9176442049871108e-05, "loss": 0.7816, "num_input_tokens_seen": 23902504, "step": 41450 }, { "epoch": 6.1744116770926425, "grad_norm": 0.4168205261230469, "learning_rate": 1.9170122183370058e-05, "loss": 0.8323, "num_input_tokens_seen": 23905352, "step": 41455 }, { "epoch": 6.175156389633601, "grad_norm": 0.18035419285297394, "learning_rate": 1.9163802710828453e-05, "loss": 0.8053, "num_input_tokens_seen": 23908552, "step": 41460 }, { "epoch": 6.175901102174561, "grad_norm": 0.265421599149704, "learning_rate": 1.9157483632673328e-05, "loss": 0.8247, "num_input_tokens_seen": 23911432, "step": 41465 }, { "epoch": 6.17664581471552, "grad_norm": 0.23638704419136047, "learning_rate": 1.9151164949331714e-05, "loss": 0.835, "num_input_tokens_seen": 23913992, "step": 41470 }, { "epoch": 6.177390527256479, "grad_norm": 0.3284493088722229, "learning_rate": 1.914484666123058e-05, "loss": 0.7982, "num_input_tokens_seen": 23917224, "step": 41475 }, { "epoch": 6.178135239797438, "grad_norm": 0.2024184763431549, "learning_rate": 1.9138528768796915e-05, "loss": 0.7923, "num_input_tokens_seen": 23919880, "step": 41480 }, { "epoch": 6.178879952338398, "grad_norm": 0.24817518889904022, "learning_rate": 1.9132211272457634e-05, "loss": 0.8102, "num_input_tokens_seen": 23922664, "step": 41485 }, { "epoch": 6.1796246648793565, "grad_norm": 0.1929176151752472, "learning_rate": 1.9125894172639663e-05, "loss": 0.7879, "num_input_tokens_seen": 23925576, "step": 41490 }, { "epoch": 6.180369377420316, "grad_norm": 0.22145697474479675, "learning_rate": 1.9119577469769883e-05, "loss": 0.7882, "num_input_tokens_seen": 23928456, "step": 41495 }, { "epoch": 6.181114089961275, "grad_norm": 0.20061476528644562, "learning_rate": 1.9113261164275147e-05, "loss": 0.7946, "num_input_tokens_seen": 23931432, "step": 41500 }, { "epoch": 6.1818588025022345, "grad_norm": 0.22828514873981476, "learning_rate": 1.9106945256582293e-05, "loss": 0.8013, "num_input_tokens_seen": 23934184, "step": 41505 }, { "epoch": 6.182603515043193, "grad_norm": 0.2198648303747177, "learning_rate": 1.9100629747118117e-05, "loss": 0.8147, "num_input_tokens_seen": 23937128, "step": 41510 }, { "epoch": 6.183348227584153, "grad_norm": 0.20949727296829224, "learning_rate": 1.90943146363094e-05, "loss": 0.8151, "num_input_tokens_seen": 23939816, "step": 41515 }, { "epoch": 6.184092940125112, "grad_norm": 0.24384891986846924, "learning_rate": 1.9087999924582884e-05, "loss": 0.8108, "num_input_tokens_seen": 23942888, "step": 41520 }, { "epoch": 6.18483765266607, "grad_norm": 0.23739288747310638, "learning_rate": 1.9081685612365298e-05, "loss": 0.7874, "num_input_tokens_seen": 23945896, "step": 41525 }, { "epoch": 6.18558236520703, "grad_norm": 0.2573300302028656, "learning_rate": 1.9075371700083333e-05, "loss": 0.798, "num_input_tokens_seen": 23948712, "step": 41530 }, { "epoch": 6.18632707774799, "grad_norm": 0.23883813619613647, "learning_rate": 1.906905818816366e-05, "loss": 0.8154, "num_input_tokens_seen": 23951688, "step": 41535 }, { "epoch": 6.1870717902889485, "grad_norm": 0.19819581508636475, "learning_rate": 1.906274507703293e-05, "loss": 0.7932, "num_input_tokens_seen": 23954728, "step": 41540 }, { "epoch": 6.187816502829907, "grad_norm": 0.2492084950208664, "learning_rate": 1.9056432367117744e-05, "loss": 0.7994, "num_input_tokens_seen": 23957448, "step": 41545 }, { "epoch": 6.188561215370867, "grad_norm": 0.19888487458229065, "learning_rate": 1.90501200588447e-05, "loss": 0.782, "num_input_tokens_seen": 23960328, "step": 41550 }, { "epoch": 6.189305927911826, "grad_norm": 0.16495470702648163, "learning_rate": 1.9043808152640342e-05, "loss": 0.8105, "num_input_tokens_seen": 23963240, "step": 41555 }, { "epoch": 6.190050640452785, "grad_norm": 0.18542933464050293, "learning_rate": 1.903749664893122e-05, "loss": 0.8186, "num_input_tokens_seen": 23966344, "step": 41560 }, { "epoch": 6.190795352993744, "grad_norm": 0.20421411097049713, "learning_rate": 1.9031185548143827e-05, "loss": 0.8278, "num_input_tokens_seen": 23968904, "step": 41565 }, { "epoch": 6.191540065534704, "grad_norm": 0.28333550691604614, "learning_rate": 1.9024874850704646e-05, "loss": 0.8026, "num_input_tokens_seen": 23972200, "step": 41570 }, { "epoch": 6.1922847780756625, "grad_norm": 0.21852050721645355, "learning_rate": 1.9018564557040135e-05, "loss": 0.8125, "num_input_tokens_seen": 23974760, "step": 41575 }, { "epoch": 6.193029490616622, "grad_norm": 0.23523136973381042, "learning_rate": 1.9012254667576707e-05, "loss": 0.7952, "num_input_tokens_seen": 23977896, "step": 41580 }, { "epoch": 6.193774203157581, "grad_norm": 0.3445364236831665, "learning_rate": 1.9005945182740765e-05, "loss": 0.7872, "num_input_tokens_seen": 23980872, "step": 41585 }, { "epoch": 6.1945189156985405, "grad_norm": 0.19018687307834625, "learning_rate": 1.899963610295869e-05, "loss": 0.8146, "num_input_tokens_seen": 23983592, "step": 41590 }, { "epoch": 6.195263628239499, "grad_norm": 0.19466254115104675, "learning_rate": 1.8993327428656805e-05, "loss": 0.7834, "num_input_tokens_seen": 23986184, "step": 41595 }, { "epoch": 6.196008340780459, "grad_norm": 0.2293253391981125, "learning_rate": 1.898701916026142e-05, "loss": 0.805, "num_input_tokens_seen": 23989032, "step": 41600 }, { "epoch": 6.196753053321418, "grad_norm": 0.23659232258796692, "learning_rate": 1.8980711298198843e-05, "loss": 0.8095, "num_input_tokens_seen": 23992328, "step": 41605 }, { "epoch": 6.197497765862377, "grad_norm": 0.2044655978679657, "learning_rate": 1.897440384289532e-05, "loss": 0.8051, "num_input_tokens_seen": 23995368, "step": 41610 }, { "epoch": 6.198242478403336, "grad_norm": 0.13788509368896484, "learning_rate": 1.8968096794777087e-05, "loss": 0.796, "num_input_tokens_seen": 23998120, "step": 41615 }, { "epoch": 6.198987190944296, "grad_norm": 0.2700219452381134, "learning_rate": 1.896179015427035e-05, "loss": 0.8109, "num_input_tokens_seen": 24000840, "step": 41620 }, { "epoch": 6.1997319034852545, "grad_norm": 0.23884838819503784, "learning_rate": 1.8955483921801286e-05, "loss": 0.7842, "num_input_tokens_seen": 24003592, "step": 41625 }, { "epoch": 6.200476616026214, "grad_norm": 0.2582656145095825, "learning_rate": 1.894917809779605e-05, "loss": 0.7939, "num_input_tokens_seen": 24006408, "step": 41630 }, { "epoch": 6.201221328567173, "grad_norm": 0.1589728593826294, "learning_rate": 1.8942872682680747e-05, "loss": 0.8109, "num_input_tokens_seen": 24009160, "step": 41635 }, { "epoch": 6.2019660411081325, "grad_norm": 0.20366910099983215, "learning_rate": 1.893656767688148e-05, "loss": 0.8072, "num_input_tokens_seen": 24011912, "step": 41640 }, { "epoch": 6.202710753649091, "grad_norm": 0.26511186361312866, "learning_rate": 1.8930263080824327e-05, "loss": 0.7784, "num_input_tokens_seen": 24015144, "step": 41645 }, { "epoch": 6.203455466190051, "grad_norm": 0.18220561742782593, "learning_rate": 1.892395889493531e-05, "loss": 0.7809, "num_input_tokens_seen": 24018184, "step": 41650 }, { "epoch": 6.20420017873101, "grad_norm": 0.19733573496341705, "learning_rate": 1.8917655119640446e-05, "loss": 0.8411, "num_input_tokens_seen": 24020872, "step": 41655 }, { "epoch": 6.204944891271969, "grad_norm": 0.2416718304157257, "learning_rate": 1.8911351755365726e-05, "loss": 0.7838, "num_input_tokens_seen": 24024136, "step": 41660 }, { "epoch": 6.205689603812928, "grad_norm": 0.19593055546283722, "learning_rate": 1.890504880253709e-05, "loss": 0.7903, "num_input_tokens_seen": 24027240, "step": 41665 }, { "epoch": 6.206434316353888, "grad_norm": 0.20093414187431335, "learning_rate": 1.8898746261580493e-05, "loss": 0.7968, "num_input_tokens_seen": 24029928, "step": 41670 }, { "epoch": 6.2071790288948465, "grad_norm": 0.18565766513347626, "learning_rate": 1.8892444132921803e-05, "loss": 0.8062, "num_input_tokens_seen": 24032936, "step": 41675 }, { "epoch": 6.207923741435806, "grad_norm": 0.22786878049373627, "learning_rate": 1.8886142416986917e-05, "loss": 0.8245, "num_input_tokens_seen": 24035880, "step": 41680 }, { "epoch": 6.208668453976765, "grad_norm": 0.26164311170578003, "learning_rate": 1.887984111420166e-05, "loss": 0.8074, "num_input_tokens_seen": 24038888, "step": 41685 }, { "epoch": 6.209413166517725, "grad_norm": 0.197305366396904, "learning_rate": 1.8873540224991864e-05, "loss": 0.8041, "num_input_tokens_seen": 24041608, "step": 41690 }, { "epoch": 6.210157879058683, "grad_norm": 0.20365798473358154, "learning_rate": 1.8867239749783307e-05, "loss": 0.8064, "num_input_tokens_seen": 24044456, "step": 41695 }, { "epoch": 6.210902591599643, "grad_norm": 0.21771284937858582, "learning_rate": 1.8860939689001754e-05, "loss": 0.7969, "num_input_tokens_seen": 24047176, "step": 41700 }, { "epoch": 6.211647304140602, "grad_norm": 0.1758827418088913, "learning_rate": 1.885464004307294e-05, "loss": 0.7923, "num_input_tokens_seen": 24050184, "step": 41705 }, { "epoch": 6.2123920166815605, "grad_norm": 0.14060090482234955, "learning_rate": 1.8848340812422574e-05, "loss": 0.8134, "num_input_tokens_seen": 24053000, "step": 41710 }, { "epoch": 6.21313672922252, "grad_norm": 0.21503683924674988, "learning_rate": 1.884204199747631e-05, "loss": 0.8228, "num_input_tokens_seen": 24055976, "step": 41715 }, { "epoch": 6.213881441763479, "grad_norm": 0.22599440813064575, "learning_rate": 1.8835743598659815e-05, "loss": 0.7962, "num_input_tokens_seen": 24058696, "step": 41720 }, { "epoch": 6.2146261543044385, "grad_norm": 0.23403872549533844, "learning_rate": 1.8829445616398713e-05, "loss": 0.8014, "num_input_tokens_seen": 24061736, "step": 41725 }, { "epoch": 6.215370866845397, "grad_norm": 0.21619993448257446, "learning_rate": 1.8823148051118585e-05, "loss": 0.81, "num_input_tokens_seen": 24064808, "step": 41730 }, { "epoch": 6.216115579386357, "grad_norm": 0.19988678395748138, "learning_rate": 1.8816850903244994e-05, "loss": 0.8083, "num_input_tokens_seen": 24067752, "step": 41735 }, { "epoch": 6.216860291927316, "grad_norm": 0.2672960162162781, "learning_rate": 1.8810554173203486e-05, "loss": 0.8267, "num_input_tokens_seen": 24070408, "step": 41740 }, { "epoch": 6.217605004468275, "grad_norm": 0.15127314627170563, "learning_rate": 1.8804257861419556e-05, "loss": 0.7981, "num_input_tokens_seen": 24073256, "step": 41745 }, { "epoch": 6.218349717009234, "grad_norm": 0.23386172950267792, "learning_rate": 1.87979619683187e-05, "loss": 0.785, "num_input_tokens_seen": 24076424, "step": 41750 }, { "epoch": 6.219094429550194, "grad_norm": 0.1325373500585556, "learning_rate": 1.8791666494326353e-05, "loss": 0.8194, "num_input_tokens_seen": 24079400, "step": 41755 }, { "epoch": 6.2198391420911525, "grad_norm": 0.21872244775295258, "learning_rate": 1.8785371439867945e-05, "loss": 0.8106, "num_input_tokens_seen": 24081960, "step": 41760 }, { "epoch": 6.220583854632112, "grad_norm": 0.2688397169113159, "learning_rate": 1.8779076805368862e-05, "loss": 0.8165, "num_input_tokens_seen": 24085000, "step": 41765 }, { "epoch": 6.221328567173071, "grad_norm": 0.29253485798835754, "learning_rate": 1.8772782591254474e-05, "loss": 0.8005, "num_input_tokens_seen": 24087848, "step": 41770 }, { "epoch": 6.222073279714031, "grad_norm": 0.22067564725875854, "learning_rate": 1.876648879795013e-05, "loss": 0.8036, "num_input_tokens_seen": 24090792, "step": 41775 }, { "epoch": 6.222817992254989, "grad_norm": 0.1921635866165161, "learning_rate": 1.8760195425881122e-05, "loss": 0.7958, "num_input_tokens_seen": 24093640, "step": 41780 }, { "epoch": 6.223562704795949, "grad_norm": 0.21326954662799835, "learning_rate": 1.8753902475472738e-05, "loss": 0.7954, "num_input_tokens_seen": 24096776, "step": 41785 }, { "epoch": 6.224307417336908, "grad_norm": 0.16874979436397552, "learning_rate": 1.874760994715024e-05, "loss": 0.8227, "num_input_tokens_seen": 24099624, "step": 41790 }, { "epoch": 6.225052129877867, "grad_norm": 0.2590326964855194, "learning_rate": 1.874131784133884e-05, "loss": 0.7904, "num_input_tokens_seen": 24102600, "step": 41795 }, { "epoch": 6.225796842418826, "grad_norm": 0.20722676813602448, "learning_rate": 1.8735026158463724e-05, "loss": 0.7996, "num_input_tokens_seen": 24105512, "step": 41800 }, { "epoch": 6.226541554959786, "grad_norm": 0.21260032057762146, "learning_rate": 1.8728734898950072e-05, "loss": 0.797, "num_input_tokens_seen": 24108264, "step": 41805 }, { "epoch": 6.2272862675007445, "grad_norm": 0.21609100699424744, "learning_rate": 1.8722444063223023e-05, "loss": 0.8094, "num_input_tokens_seen": 24111016, "step": 41810 }, { "epoch": 6.228030980041704, "grad_norm": 0.14905451238155365, "learning_rate": 1.871615365170768e-05, "loss": 0.8039, "num_input_tokens_seen": 24113832, "step": 41815 }, { "epoch": 6.228775692582663, "grad_norm": 0.2649911046028137, "learning_rate": 1.870986366482912e-05, "loss": 0.7855, "num_input_tokens_seen": 24117032, "step": 41820 }, { "epoch": 6.229520405123623, "grad_norm": 0.1695600003004074, "learning_rate": 1.8703574103012407e-05, "loss": 0.813, "num_input_tokens_seen": 24119784, "step": 41825 }, { "epoch": 6.230265117664581, "grad_norm": 0.3392620086669922, "learning_rate": 1.869728496668256e-05, "loss": 0.7763, "num_input_tokens_seen": 24123464, "step": 41830 }, { "epoch": 6.231009830205541, "grad_norm": 0.2071155458688736, "learning_rate": 1.869099625626456e-05, "loss": 0.8211, "num_input_tokens_seen": 24126056, "step": 41835 }, { "epoch": 6.2317545427465, "grad_norm": 0.37848028540611267, "learning_rate": 1.8684707972183383e-05, "loss": 0.8088, "num_input_tokens_seen": 24129096, "step": 41840 }, { "epoch": 6.232499255287459, "grad_norm": 0.25598499178886414, "learning_rate": 1.867842011486397e-05, "loss": 0.7816, "num_input_tokens_seen": 24131848, "step": 41845 }, { "epoch": 6.233243967828418, "grad_norm": 0.20277781784534454, "learning_rate": 1.867213268473122e-05, "loss": 0.8174, "num_input_tokens_seen": 24134920, "step": 41850 }, { "epoch": 6.233988680369378, "grad_norm": 0.19569697976112366, "learning_rate": 1.866584568221002e-05, "loss": 0.8238, "num_input_tokens_seen": 24137832, "step": 41855 }, { "epoch": 6.234733392910337, "grad_norm": 0.28639256954193115, "learning_rate": 1.865955910772521e-05, "loss": 0.7956, "num_input_tokens_seen": 24140776, "step": 41860 }, { "epoch": 6.235478105451296, "grad_norm": 0.24405726790428162, "learning_rate": 1.8653272961701618e-05, "loss": 0.8141, "num_input_tokens_seen": 24143560, "step": 41865 }, { "epoch": 6.236222817992255, "grad_norm": 0.28552237153053284, "learning_rate": 1.8646987244564047e-05, "loss": 0.7843, "num_input_tokens_seen": 24146568, "step": 41870 }, { "epoch": 6.236967530533214, "grad_norm": 0.3083479106426239, "learning_rate": 1.8640701956737238e-05, "loss": 0.8164, "num_input_tokens_seen": 24149384, "step": 41875 }, { "epoch": 6.237712243074173, "grad_norm": 0.1970367580652237, "learning_rate": 1.8634417098645937e-05, "loss": 0.8167, "num_input_tokens_seen": 24152360, "step": 41880 }, { "epoch": 6.238456955615132, "grad_norm": 0.26804372668266296, "learning_rate": 1.8628132670714846e-05, "loss": 0.8047, "num_input_tokens_seen": 24155336, "step": 41885 }, { "epoch": 6.239201668156092, "grad_norm": 0.23879839479923248, "learning_rate": 1.8621848673368648e-05, "loss": 0.7868, "num_input_tokens_seen": 24158408, "step": 41890 }, { "epoch": 6.2399463806970505, "grad_norm": 0.2559373378753662, "learning_rate": 1.861556510703198e-05, "loss": 0.8131, "num_input_tokens_seen": 24161640, "step": 41895 }, { "epoch": 6.24069109323801, "grad_norm": 0.21533119678497314, "learning_rate": 1.8609281972129464e-05, "loss": 0.7827, "num_input_tokens_seen": 24164712, "step": 41900 }, { "epoch": 6.241435805778969, "grad_norm": 0.182700514793396, "learning_rate": 1.8602999269085693e-05, "loss": 0.8029, "num_input_tokens_seen": 24167368, "step": 41905 }, { "epoch": 6.242180518319929, "grad_norm": 0.21885520219802856, "learning_rate": 1.859671699832523e-05, "loss": 0.8281, "num_input_tokens_seen": 24170312, "step": 41910 }, { "epoch": 6.242925230860887, "grad_norm": 0.22944431006908417, "learning_rate": 1.859043516027259e-05, "loss": 0.8037, "num_input_tokens_seen": 24173224, "step": 41915 }, { "epoch": 6.243669943401847, "grad_norm": 0.14735634624958038, "learning_rate": 1.8584153755352282e-05, "loss": 0.7953, "num_input_tokens_seen": 24175848, "step": 41920 }, { "epoch": 6.244414655942806, "grad_norm": 0.33728736639022827, "learning_rate": 1.857787278398878e-05, "loss": 0.8074, "num_input_tokens_seen": 24179336, "step": 41925 }, { "epoch": 6.245159368483765, "grad_norm": 0.22810900211334229, "learning_rate": 1.8571592246606522e-05, "loss": 0.7807, "num_input_tokens_seen": 24182152, "step": 41930 }, { "epoch": 6.245904081024724, "grad_norm": 0.22067755460739136, "learning_rate": 1.8565312143629926e-05, "loss": 0.7988, "num_input_tokens_seen": 24185128, "step": 41935 }, { "epoch": 6.246648793565684, "grad_norm": 0.2224918156862259, "learning_rate": 1.855903247548338e-05, "loss": 0.8204, "num_input_tokens_seen": 24188168, "step": 41940 }, { "epoch": 6.247393506106643, "grad_norm": 0.20466028153896332, "learning_rate": 1.855275324259122e-05, "loss": 0.7999, "num_input_tokens_seen": 24190984, "step": 41945 }, { "epoch": 6.248138218647602, "grad_norm": 0.2264515608549118, "learning_rate": 1.8546474445377786e-05, "loss": 0.789, "num_input_tokens_seen": 24193960, "step": 41950 }, { "epoch": 6.248882931188561, "grad_norm": 0.18433362245559692, "learning_rate": 1.8540196084267386e-05, "loss": 0.8223, "num_input_tokens_seen": 24196616, "step": 41955 }, { "epoch": 6.249627643729521, "grad_norm": 0.17704619467258453, "learning_rate": 1.8533918159684262e-05, "loss": 0.8217, "num_input_tokens_seen": 24199368, "step": 41960 }, { "epoch": 6.250372356270479, "grad_norm": 0.22575758397579193, "learning_rate": 1.8527640672052655e-05, "loss": 0.806, "num_input_tokens_seen": 24202440, "step": 41965 }, { "epoch": 6.251117068811439, "grad_norm": 0.4076444208621979, "learning_rate": 1.8521363621796774e-05, "loss": 0.8151, "num_input_tokens_seen": 24205288, "step": 41970 }, { "epoch": 6.251861781352398, "grad_norm": 0.23142510652542114, "learning_rate": 1.8515087009340808e-05, "loss": 0.7955, "num_input_tokens_seen": 24208424, "step": 41975 }, { "epoch": 6.252606493893357, "grad_norm": 0.272227019071579, "learning_rate": 1.850881083510889e-05, "loss": 0.7927, "num_input_tokens_seen": 24211400, "step": 41980 }, { "epoch": 6.253351206434316, "grad_norm": 0.2786649763584137, "learning_rate": 1.850253509952514e-05, "loss": 0.8128, "num_input_tokens_seen": 24214376, "step": 41985 }, { "epoch": 6.254095918975276, "grad_norm": 0.1914311945438385, "learning_rate": 1.8496259803013667e-05, "loss": 0.7941, "num_input_tokens_seen": 24217224, "step": 41990 }, { "epoch": 6.254840631516235, "grad_norm": 0.22341054677963257, "learning_rate": 1.8489984945998512e-05, "loss": 0.777, "num_input_tokens_seen": 24220712, "step": 41995 }, { "epoch": 6.255585344057194, "grad_norm": 0.2445414960384369, "learning_rate": 1.8483710528903698e-05, "loss": 0.8116, "num_input_tokens_seen": 24223752, "step": 42000 }, { "epoch": 6.256330056598153, "grad_norm": 0.1848071813583374, "learning_rate": 1.847743655215323e-05, "loss": 0.8054, "num_input_tokens_seen": 24226984, "step": 42005 }, { "epoch": 6.257074769139113, "grad_norm": 0.22113053500652313, "learning_rate": 1.8471163016171088e-05, "loss": 0.7756, "num_input_tokens_seen": 24229960, "step": 42010 }, { "epoch": 6.257819481680071, "grad_norm": 0.21839746832847595, "learning_rate": 1.84648899213812e-05, "loss": 0.8232, "num_input_tokens_seen": 24232904, "step": 42015 }, { "epoch": 6.258564194221031, "grad_norm": 0.14573100209236145, "learning_rate": 1.845861726820749e-05, "loss": 0.7957, "num_input_tokens_seen": 24235400, "step": 42020 }, { "epoch": 6.25930890676199, "grad_norm": 0.15557464957237244, "learning_rate": 1.845234505707382e-05, "loss": 0.7915, "num_input_tokens_seen": 24238280, "step": 42025 }, { "epoch": 6.2600536193029495, "grad_norm": 0.1970449686050415, "learning_rate": 1.844607328840405e-05, "loss": 0.783, "num_input_tokens_seen": 24241064, "step": 42030 }, { "epoch": 6.260798331843908, "grad_norm": 0.22689510881900787, "learning_rate": 1.8439801962622016e-05, "loss": 0.7803, "num_input_tokens_seen": 24244104, "step": 42035 }, { "epoch": 6.261543044384867, "grad_norm": 0.2509115934371948, "learning_rate": 1.8433531080151482e-05, "loss": 0.7973, "num_input_tokens_seen": 24246920, "step": 42040 }, { "epoch": 6.262287756925827, "grad_norm": 0.23585724830627441, "learning_rate": 1.842726064141622e-05, "loss": 0.8293, "num_input_tokens_seen": 24249640, "step": 42045 }, { "epoch": 6.263032469466786, "grad_norm": 0.2625165283679962, "learning_rate": 1.8420990646839957e-05, "loss": 0.8007, "num_input_tokens_seen": 24252584, "step": 42050 }, { "epoch": 6.263777182007745, "grad_norm": 0.28778979182243347, "learning_rate": 1.841472109684641e-05, "loss": 0.8138, "num_input_tokens_seen": 24256072, "step": 42055 }, { "epoch": 6.264521894548704, "grad_norm": 0.20693297684192657, "learning_rate": 1.8408451991859228e-05, "loss": 0.8034, "num_input_tokens_seen": 24258760, "step": 42060 }, { "epoch": 6.265266607089663, "grad_norm": 0.26932263374328613, "learning_rate": 1.840218333230206e-05, "loss": 0.7915, "num_input_tokens_seen": 24261480, "step": 42065 }, { "epoch": 6.266011319630622, "grad_norm": 0.1490093320608139, "learning_rate": 1.8395915118598523e-05, "loss": 0.8177, "num_input_tokens_seen": 24264136, "step": 42070 }, { "epoch": 6.266756032171582, "grad_norm": 0.2307584136724472, "learning_rate": 1.83896473511722e-05, "loss": 0.8056, "num_input_tokens_seen": 24266696, "step": 42075 }, { "epoch": 6.267500744712541, "grad_norm": 0.2214418649673462, "learning_rate": 1.838338003044662e-05, "loss": 0.797, "num_input_tokens_seen": 24269608, "step": 42080 }, { "epoch": 6.2682454572535, "grad_norm": 0.2670783996582031, "learning_rate": 1.8377113156845317e-05, "loss": 0.7778, "num_input_tokens_seen": 24272328, "step": 42085 }, { "epoch": 6.268990169794459, "grad_norm": 0.3075616657733917, "learning_rate": 1.8370846730791786e-05, "loss": 0.8214, "num_input_tokens_seen": 24275336, "step": 42090 }, { "epoch": 6.269734882335419, "grad_norm": 0.2814196050167084, "learning_rate": 1.8364580752709475e-05, "loss": 0.8029, "num_input_tokens_seen": 24278536, "step": 42095 }, { "epoch": 6.270479594876377, "grad_norm": 0.22470690310001373, "learning_rate": 1.8358315223021814e-05, "loss": 0.8005, "num_input_tokens_seen": 24281224, "step": 42100 }, { "epoch": 6.271224307417337, "grad_norm": 0.21313506364822388, "learning_rate": 1.835205014215222e-05, "loss": 0.7828, "num_input_tokens_seen": 24284104, "step": 42105 }, { "epoch": 6.271969019958296, "grad_norm": 0.2246057689189911, "learning_rate": 1.8345785510524042e-05, "loss": 0.8238, "num_input_tokens_seen": 24286856, "step": 42110 }, { "epoch": 6.2727137324992555, "grad_norm": 0.24661500751972198, "learning_rate": 1.833952132856063e-05, "loss": 0.7927, "num_input_tokens_seen": 24289864, "step": 42115 }, { "epoch": 6.273458445040214, "grad_norm": 0.18273162841796875, "learning_rate": 1.8333257596685284e-05, "loss": 0.775, "num_input_tokens_seen": 24292584, "step": 42120 }, { "epoch": 6.274203157581174, "grad_norm": 0.20961277186870575, "learning_rate": 1.832699431532129e-05, "loss": 0.8011, "num_input_tokens_seen": 24295816, "step": 42125 }, { "epoch": 6.274947870122133, "grad_norm": 0.23075033724308014, "learning_rate": 1.832073148489188e-05, "loss": 0.8038, "num_input_tokens_seen": 24298792, "step": 42130 }, { "epoch": 6.275692582663092, "grad_norm": 0.1970091313123703, "learning_rate": 1.831446910582028e-05, "loss": 0.7903, "num_input_tokens_seen": 24301672, "step": 42135 }, { "epoch": 6.276437295204051, "grad_norm": 0.2819448709487915, "learning_rate": 1.8308207178529684e-05, "loss": 0.8282, "num_input_tokens_seen": 24304424, "step": 42140 }, { "epoch": 6.277182007745011, "grad_norm": 0.20111840963363647, "learning_rate": 1.8301945703443236e-05, "loss": 0.8203, "num_input_tokens_seen": 24307144, "step": 42145 }, { "epoch": 6.277926720285969, "grad_norm": 0.21357190608978271, "learning_rate": 1.8295684680984062e-05, "loss": 0.7763, "num_input_tokens_seen": 24309832, "step": 42150 }, { "epoch": 6.278671432826929, "grad_norm": 0.21429190039634705, "learning_rate": 1.828942411157527e-05, "loss": 0.8198, "num_input_tokens_seen": 24312808, "step": 42155 }, { "epoch": 6.279416145367888, "grad_norm": 0.21040169894695282, "learning_rate": 1.828316399563991e-05, "loss": 0.8341, "num_input_tokens_seen": 24315880, "step": 42160 }, { "epoch": 6.2801608579088475, "grad_norm": 0.2235104739665985, "learning_rate": 1.8276904333601015e-05, "loss": 0.817, "num_input_tokens_seen": 24318728, "step": 42165 }, { "epoch": 6.280905570449806, "grad_norm": 0.19469337165355682, "learning_rate": 1.8270645125881585e-05, "loss": 0.7958, "num_input_tokens_seen": 24321320, "step": 42170 }, { "epoch": 6.281650282990766, "grad_norm": 0.16785842180252075, "learning_rate": 1.8264386372904608e-05, "loss": 0.7981, "num_input_tokens_seen": 24324072, "step": 42175 }, { "epoch": 6.282394995531725, "grad_norm": 0.22901882231235504, "learning_rate": 1.825812807509301e-05, "loss": 0.8064, "num_input_tokens_seen": 24326984, "step": 42180 }, { "epoch": 6.283139708072684, "grad_norm": 0.6249558925628662, "learning_rate": 1.825187023286971e-05, "loss": 0.8341, "num_input_tokens_seen": 24330120, "step": 42185 }, { "epoch": 6.283884420613643, "grad_norm": 0.34781333804130554, "learning_rate": 1.824561284665758e-05, "loss": 0.8403, "num_input_tokens_seen": 24333032, "step": 42190 }, { "epoch": 6.284629133154603, "grad_norm": 0.3464897871017456, "learning_rate": 1.823935591687948e-05, "loss": 0.809, "num_input_tokens_seen": 24336072, "step": 42195 }, { "epoch": 6.2853738456955615, "grad_norm": 0.19094955921173096, "learning_rate": 1.8233099443958212e-05, "loss": 0.797, "num_input_tokens_seen": 24338952, "step": 42200 }, { "epoch": 6.286118558236521, "grad_norm": 0.2043655514717102, "learning_rate": 1.8226843428316576e-05, "loss": 0.8229, "num_input_tokens_seen": 24341864, "step": 42205 }, { "epoch": 6.28686327077748, "grad_norm": 0.24772340059280396, "learning_rate": 1.8220587870377315e-05, "loss": 0.7927, "num_input_tokens_seen": 24344840, "step": 42210 }, { "epoch": 6.2876079833184395, "grad_norm": 0.18109773099422455, "learning_rate": 1.8214332770563165e-05, "loss": 0.7998, "num_input_tokens_seen": 24347976, "step": 42215 }, { "epoch": 6.288352695859398, "grad_norm": 0.20168522000312805, "learning_rate": 1.8208078129296827e-05, "loss": 0.8185, "num_input_tokens_seen": 24350952, "step": 42220 }, { "epoch": 6.289097408400357, "grad_norm": 0.22641043365001678, "learning_rate": 1.8201823947000947e-05, "loss": 0.7963, "num_input_tokens_seen": 24353768, "step": 42225 }, { "epoch": 6.289842120941317, "grad_norm": 0.1721624881029129, "learning_rate": 1.8195570224098162e-05, "loss": 0.7962, "num_input_tokens_seen": 24356680, "step": 42230 }, { "epoch": 6.290586833482275, "grad_norm": 0.1957637220621109, "learning_rate": 1.8189316961011092e-05, "loss": 0.8228, "num_input_tokens_seen": 24359336, "step": 42235 }, { "epoch": 6.291331546023235, "grad_norm": 0.19531777501106262, "learning_rate": 1.818306415816228e-05, "loss": 0.7652, "num_input_tokens_seen": 24362056, "step": 42240 }, { "epoch": 6.292076258564194, "grad_norm": 0.18624626100063324, "learning_rate": 1.817681181597428e-05, "loss": 0.7825, "num_input_tokens_seen": 24364968, "step": 42245 }, { "epoch": 6.2928209711051535, "grad_norm": 0.24433191120624542, "learning_rate": 1.817055993486959e-05, "loss": 0.7974, "num_input_tokens_seen": 24367912, "step": 42250 }, { "epoch": 6.293565683646112, "grad_norm": 0.3326278030872345, "learning_rate": 1.81643085152707e-05, "loss": 0.7977, "num_input_tokens_seen": 24370728, "step": 42255 }, { "epoch": 6.294310396187072, "grad_norm": 0.19404710829257965, "learning_rate": 1.8158057557600045e-05, "loss": 0.8378, "num_input_tokens_seen": 24373544, "step": 42260 }, { "epoch": 6.295055108728031, "grad_norm": 0.2571565508842468, "learning_rate": 1.815180706228004e-05, "loss": 0.8271, "num_input_tokens_seen": 24376680, "step": 42265 }, { "epoch": 6.29579982126899, "grad_norm": 0.3012305200099945, "learning_rate": 1.814555702973308e-05, "loss": 0.7998, "num_input_tokens_seen": 24379496, "step": 42270 }, { "epoch": 6.296544533809949, "grad_norm": 0.2067285031080246, "learning_rate": 1.813930746038151e-05, "loss": 0.7938, "num_input_tokens_seen": 24382664, "step": 42275 }, { "epoch": 6.297289246350909, "grad_norm": 0.22767959535121918, "learning_rate": 1.813305835464764e-05, "loss": 0.8164, "num_input_tokens_seen": 24385448, "step": 42280 }, { "epoch": 6.2980339588918675, "grad_norm": 0.23238058388233185, "learning_rate": 1.8126809712953766e-05, "loss": 0.8077, "num_input_tokens_seen": 24388232, "step": 42285 }, { "epoch": 6.298778671432827, "grad_norm": 0.20627418160438538, "learning_rate": 1.8120561535722153e-05, "loss": 0.7918, "num_input_tokens_seen": 24391048, "step": 42290 }, { "epoch": 6.299523383973786, "grad_norm": 0.17606809735298157, "learning_rate": 1.8114313823375015e-05, "loss": 0.7933, "num_input_tokens_seen": 24393896, "step": 42295 }, { "epoch": 6.3002680965147455, "grad_norm": 0.2202713042497635, "learning_rate": 1.8108066576334554e-05, "loss": 0.8004, "num_input_tokens_seen": 24396712, "step": 42300 }, { "epoch": 6.301012809055704, "grad_norm": 0.2555077373981476, "learning_rate": 1.810181979502294e-05, "loss": 0.7737, "num_input_tokens_seen": 24399816, "step": 42305 }, { "epoch": 6.301757521596664, "grad_norm": 0.18845084309577942, "learning_rate": 1.8095573479862294e-05, "loss": 0.8049, "num_input_tokens_seen": 24402600, "step": 42310 }, { "epoch": 6.302502234137623, "grad_norm": 0.20248526334762573, "learning_rate": 1.8089327631274726e-05, "loss": 0.7983, "num_input_tokens_seen": 24405448, "step": 42315 }, { "epoch": 6.303246946678582, "grad_norm": 0.19716109335422516, "learning_rate": 1.8083082249682294e-05, "loss": 0.8124, "num_input_tokens_seen": 24408392, "step": 42320 }, { "epoch": 6.303991659219541, "grad_norm": 0.30101025104522705, "learning_rate": 1.8076837335507047e-05, "loss": 0.7829, "num_input_tokens_seen": 24411400, "step": 42325 }, { "epoch": 6.304736371760501, "grad_norm": 0.24140682816505432, "learning_rate": 1.8070592889170977e-05, "loss": 0.7974, "num_input_tokens_seen": 24414408, "step": 42330 }, { "epoch": 6.3054810843014595, "grad_norm": 0.14167086780071259, "learning_rate": 1.806434891109607e-05, "loss": 0.8057, "num_input_tokens_seen": 24417256, "step": 42335 }, { "epoch": 6.306225796842419, "grad_norm": 0.24296492338180542, "learning_rate": 1.8058105401704267e-05, "loss": 0.8125, "num_input_tokens_seen": 24420424, "step": 42340 }, { "epoch": 6.306970509383378, "grad_norm": 0.2383994460105896, "learning_rate": 1.8051862361417478e-05, "loss": 0.7973, "num_input_tokens_seen": 24423240, "step": 42345 }, { "epoch": 6.3077152219243375, "grad_norm": 0.14313878118991852, "learning_rate": 1.804561979065758e-05, "loss": 0.7815, "num_input_tokens_seen": 24426088, "step": 42350 }, { "epoch": 6.308459934465296, "grad_norm": 0.1954817920923233, "learning_rate": 1.8039377689846427e-05, "loss": 0.817, "num_input_tokens_seen": 24429224, "step": 42355 }, { "epoch": 6.309204647006256, "grad_norm": 0.27295660972595215, "learning_rate": 1.8033136059405826e-05, "loss": 0.8015, "num_input_tokens_seen": 24432296, "step": 42360 }, { "epoch": 6.309949359547215, "grad_norm": 0.1847294420003891, "learning_rate": 1.8026894899757562e-05, "loss": 0.7764, "num_input_tokens_seen": 24435240, "step": 42365 }, { "epoch": 6.310694072088174, "grad_norm": 0.18564797937870026, "learning_rate": 1.8020654211323396e-05, "loss": 0.8042, "num_input_tokens_seen": 24437992, "step": 42370 }, { "epoch": 6.311438784629133, "grad_norm": 0.2435465306043625, "learning_rate": 1.8014413994525036e-05, "loss": 0.805, "num_input_tokens_seen": 24440712, "step": 42375 }, { "epoch": 6.312183497170093, "grad_norm": 0.20939305424690247, "learning_rate": 1.8008174249784175e-05, "loss": 0.7962, "num_input_tokens_seen": 24443496, "step": 42380 }, { "epoch": 6.3129282097110515, "grad_norm": 0.2043144255876541, "learning_rate": 1.8001934977522477e-05, "loss": 0.7862, "num_input_tokens_seen": 24446728, "step": 42385 }, { "epoch": 6.31367292225201, "grad_norm": 0.20058467984199524, "learning_rate": 1.799569617816156e-05, "loss": 0.788, "num_input_tokens_seen": 24449608, "step": 42390 }, { "epoch": 6.31441763479297, "grad_norm": 0.24265488982200623, "learning_rate": 1.7989457852123026e-05, "loss": 0.8302, "num_input_tokens_seen": 24452712, "step": 42395 }, { "epoch": 6.31516234733393, "grad_norm": 0.28689461946487427, "learning_rate": 1.7983219999828417e-05, "loss": 0.8179, "num_input_tokens_seen": 24455496, "step": 42400 }, { "epoch": 6.315907059874888, "grad_norm": 0.1894288957118988, "learning_rate": 1.7976982621699278e-05, "loss": 0.824, "num_input_tokens_seen": 24458376, "step": 42405 }, { "epoch": 6.316651772415847, "grad_norm": 0.19938497245311737, "learning_rate": 1.7970745718157095e-05, "loss": 0.8081, "num_input_tokens_seen": 24461032, "step": 42410 }, { "epoch": 6.317396484956807, "grad_norm": 0.23322124779224396, "learning_rate": 1.7964509289623335e-05, "loss": 0.808, "num_input_tokens_seen": 24464104, "step": 42415 }, { "epoch": 6.3181411974977655, "grad_norm": 0.20634259283542633, "learning_rate": 1.795827333651944e-05, "loss": 0.8051, "num_input_tokens_seen": 24467368, "step": 42420 }, { "epoch": 6.318885910038725, "grad_norm": 0.19474336504936218, "learning_rate": 1.7952037859266795e-05, "loss": 0.8065, "num_input_tokens_seen": 24470184, "step": 42425 }, { "epoch": 6.319630622579684, "grad_norm": 0.25203344225883484, "learning_rate": 1.7945802858286782e-05, "loss": 0.7978, "num_input_tokens_seen": 24473192, "step": 42430 }, { "epoch": 6.3203753351206435, "grad_norm": 0.23357655107975006, "learning_rate": 1.793956833400074e-05, "loss": 0.8138, "num_input_tokens_seen": 24475880, "step": 42435 }, { "epoch": 6.321120047661602, "grad_norm": 0.17009492218494415, "learning_rate": 1.793333428682996e-05, "loss": 0.797, "num_input_tokens_seen": 24478952, "step": 42440 }, { "epoch": 6.321864760202562, "grad_norm": 0.18371912837028503, "learning_rate": 1.7927100717195712e-05, "loss": 0.8014, "num_input_tokens_seen": 24481608, "step": 42445 }, { "epoch": 6.322609472743521, "grad_norm": 0.21658119559288025, "learning_rate": 1.792086762551924e-05, "loss": 0.8013, "num_input_tokens_seen": 24484744, "step": 42450 }, { "epoch": 6.32335418528448, "grad_norm": 0.19214169681072235, "learning_rate": 1.791463501222176e-05, "loss": 0.7903, "num_input_tokens_seen": 24487720, "step": 42455 }, { "epoch": 6.324098897825439, "grad_norm": 0.2629915773868561, "learning_rate": 1.790840287772443e-05, "loss": 0.7765, "num_input_tokens_seen": 24490792, "step": 42460 }, { "epoch": 6.324843610366399, "grad_norm": 0.2100127786397934, "learning_rate": 1.79021712224484e-05, "loss": 0.8124, "num_input_tokens_seen": 24493512, "step": 42465 }, { "epoch": 6.3255883229073575, "grad_norm": 0.208852618932724, "learning_rate": 1.789594004681479e-05, "loss": 0.8171, "num_input_tokens_seen": 24496360, "step": 42470 }, { "epoch": 6.326333035448317, "grad_norm": 0.20860502123832703, "learning_rate": 1.7889709351244675e-05, "loss": 0.7794, "num_input_tokens_seen": 24499112, "step": 42475 }, { "epoch": 6.327077747989276, "grad_norm": 0.22691801190376282, "learning_rate": 1.788347913615908e-05, "loss": 0.8017, "num_input_tokens_seen": 24501992, "step": 42480 }, { "epoch": 6.327822460530236, "grad_norm": 0.2507566809654236, "learning_rate": 1.7877249401979034e-05, "loss": 0.7947, "num_input_tokens_seen": 24504808, "step": 42485 }, { "epoch": 6.328567173071194, "grad_norm": 0.246464803814888, "learning_rate": 1.7871020149125517e-05, "loss": 0.7794, "num_input_tokens_seen": 24507560, "step": 42490 }, { "epoch": 6.329311885612154, "grad_norm": 0.2413693070411682, "learning_rate": 1.786479137801947e-05, "loss": 0.7939, "num_input_tokens_seen": 24510344, "step": 42495 }, { "epoch": 6.330056598153113, "grad_norm": 0.20346300303936005, "learning_rate": 1.7858563089081812e-05, "loss": 0.7973, "num_input_tokens_seen": 24513288, "step": 42500 }, { "epoch": 6.330801310694072, "grad_norm": 0.25549212098121643, "learning_rate": 1.7852335282733432e-05, "loss": 0.8132, "num_input_tokens_seen": 24515880, "step": 42505 }, { "epoch": 6.331546023235031, "grad_norm": 0.2295803427696228, "learning_rate": 1.7846107959395165e-05, "loss": 0.8084, "num_input_tokens_seen": 24518952, "step": 42510 }, { "epoch": 6.332290735775991, "grad_norm": 0.21480287611484528, "learning_rate": 1.783988111948785e-05, "loss": 0.7752, "num_input_tokens_seen": 24521896, "step": 42515 }, { "epoch": 6.3330354483169495, "grad_norm": 0.26375022530555725, "learning_rate": 1.7833654763432245e-05, "loss": 0.7754, "num_input_tokens_seen": 24524744, "step": 42520 }, { "epoch": 6.333780160857909, "grad_norm": 0.1956643909215927, "learning_rate": 1.782742889164912e-05, "loss": 0.7922, "num_input_tokens_seen": 24527432, "step": 42525 }, { "epoch": 6.334524873398868, "grad_norm": 0.23481349647045135, "learning_rate": 1.7821203504559186e-05, "loss": 0.7885, "num_input_tokens_seen": 24530280, "step": 42530 }, { "epoch": 6.335269585939828, "grad_norm": 0.15852534770965576, "learning_rate": 1.7814978602583136e-05, "loss": 0.8115, "num_input_tokens_seen": 24533000, "step": 42535 }, { "epoch": 6.336014298480786, "grad_norm": 0.19228224456310272, "learning_rate": 1.7808754186141618e-05, "loss": 0.793, "num_input_tokens_seen": 24535784, "step": 42540 }, { "epoch": 6.336759011021746, "grad_norm": 0.27446630597114563, "learning_rate": 1.780253025565525e-05, "loss": 0.7885, "num_input_tokens_seen": 24538760, "step": 42545 }, { "epoch": 6.337503723562705, "grad_norm": 0.1832829713821411, "learning_rate": 1.7796306811544632e-05, "loss": 0.7628, "num_input_tokens_seen": 24541800, "step": 42550 }, { "epoch": 6.338248436103664, "grad_norm": 0.20869964361190796, "learning_rate": 1.779008385423031e-05, "loss": 0.7906, "num_input_tokens_seen": 24544712, "step": 42555 }, { "epoch": 6.338993148644623, "grad_norm": 0.2206176370382309, "learning_rate": 1.778386138413281e-05, "loss": 0.7839, "num_input_tokens_seen": 24547816, "step": 42560 }, { "epoch": 6.339737861185583, "grad_norm": 0.2704941928386688, "learning_rate": 1.7777639401672613e-05, "loss": 0.8115, "num_input_tokens_seen": 24550792, "step": 42565 }, { "epoch": 6.340482573726542, "grad_norm": 0.16903941333293915, "learning_rate": 1.7771417907270187e-05, "loss": 0.766, "num_input_tokens_seen": 24553672, "step": 42570 }, { "epoch": 6.3412272862675, "grad_norm": 0.20129135251045227, "learning_rate": 1.7765196901345946e-05, "loss": 0.7741, "num_input_tokens_seen": 24556456, "step": 42575 }, { "epoch": 6.34197199880846, "grad_norm": 0.2476717084646225, "learning_rate": 1.775897638432028e-05, "loss": 0.8174, "num_input_tokens_seen": 24559528, "step": 42580 }, { "epoch": 6.342716711349419, "grad_norm": 0.17997252941131592, "learning_rate": 1.775275635661356e-05, "loss": 0.8047, "num_input_tokens_seen": 24562152, "step": 42585 }, { "epoch": 6.343461423890378, "grad_norm": 0.25554296374320984, "learning_rate": 1.774653681864609e-05, "loss": 0.8287, "num_input_tokens_seen": 24565000, "step": 42590 }, { "epoch": 6.344206136431337, "grad_norm": 0.20201553404331207, "learning_rate": 1.7740317770838173e-05, "loss": 0.8046, "num_input_tokens_seen": 24567880, "step": 42595 }, { "epoch": 6.344950848972297, "grad_norm": 0.2004196047782898, "learning_rate": 1.7734099213610075e-05, "loss": 0.8063, "num_input_tokens_seen": 24570728, "step": 42600 }, { "epoch": 6.3456955615132555, "grad_norm": 0.23315247893333435, "learning_rate": 1.772788114738201e-05, "loss": 0.8172, "num_input_tokens_seen": 24573576, "step": 42605 }, { "epoch": 6.346440274054215, "grad_norm": 0.2585539221763611, "learning_rate": 1.772166357257416e-05, "loss": 0.811, "num_input_tokens_seen": 24576392, "step": 42610 }, { "epoch": 6.347184986595174, "grad_norm": 0.17843160033226013, "learning_rate": 1.7715446489606696e-05, "loss": 0.8052, "num_input_tokens_seen": 24579144, "step": 42615 }, { "epoch": 6.347929699136134, "grad_norm": 0.2236221730709076, "learning_rate": 1.770922989889975e-05, "loss": 0.7746, "num_input_tokens_seen": 24582184, "step": 42620 }, { "epoch": 6.348674411677092, "grad_norm": 0.16133898496627808, "learning_rate": 1.7703013800873398e-05, "loss": 0.7654, "num_input_tokens_seen": 24584936, "step": 42625 }, { "epoch": 6.349419124218052, "grad_norm": 0.1977464109659195, "learning_rate": 1.7696798195947704e-05, "loss": 0.7595, "num_input_tokens_seen": 24587720, "step": 42630 }, { "epoch": 6.350163836759011, "grad_norm": 0.18869909644126892, "learning_rate": 1.76905830845427e-05, "loss": 0.7905, "num_input_tokens_seen": 24590600, "step": 42635 }, { "epoch": 6.35090854929997, "grad_norm": 0.23806118965148926, "learning_rate": 1.7684368467078384e-05, "loss": 0.7775, "num_input_tokens_seen": 24593608, "step": 42640 }, { "epoch": 6.351653261840929, "grad_norm": 0.21470476686954498, "learning_rate": 1.7678154343974686e-05, "loss": 0.785, "num_input_tokens_seen": 24596456, "step": 42645 }, { "epoch": 6.352397974381889, "grad_norm": 0.23973128199577332, "learning_rate": 1.7671940715651553e-05, "loss": 0.7768, "num_input_tokens_seen": 24599240, "step": 42650 }, { "epoch": 6.353142686922848, "grad_norm": 0.19802190363407135, "learning_rate": 1.7665727582528878e-05, "loss": 0.7921, "num_input_tokens_seen": 24601992, "step": 42655 }, { "epoch": 6.353887399463807, "grad_norm": 0.2384268343448639, "learning_rate": 1.7659514945026508e-05, "loss": 0.8167, "num_input_tokens_seen": 24604584, "step": 42660 }, { "epoch": 6.354632112004766, "grad_norm": 0.24252595007419586, "learning_rate": 1.7653302803564275e-05, "loss": 0.7622, "num_input_tokens_seen": 24607624, "step": 42665 }, { "epoch": 6.355376824545726, "grad_norm": 0.2853763699531555, "learning_rate": 1.7647091158561974e-05, "loss": 0.8181, "num_input_tokens_seen": 24610344, "step": 42670 }, { "epoch": 6.356121537086684, "grad_norm": 0.26960036158561707, "learning_rate": 1.764088001043935e-05, "loss": 0.8171, "num_input_tokens_seen": 24613160, "step": 42675 }, { "epoch": 6.356866249627644, "grad_norm": 0.2008841186761856, "learning_rate": 1.763466935961615e-05, "loss": 0.7999, "num_input_tokens_seen": 24616232, "step": 42680 }, { "epoch": 6.357610962168603, "grad_norm": 0.18551824986934662, "learning_rate": 1.7628459206512033e-05, "loss": 0.7902, "num_input_tokens_seen": 24619240, "step": 42685 }, { "epoch": 6.358355674709562, "grad_norm": 0.2144627720117569, "learning_rate": 1.7622249551546682e-05, "loss": 0.7731, "num_input_tokens_seen": 24622120, "step": 42690 }, { "epoch": 6.359100387250521, "grad_norm": 0.1502605527639389, "learning_rate": 1.7616040395139706e-05, "loss": 0.8144, "num_input_tokens_seen": 24625448, "step": 42695 }, { "epoch": 6.359845099791481, "grad_norm": 0.20839519798755646, "learning_rate": 1.76098317377107e-05, "loss": 0.8113, "num_input_tokens_seen": 24628392, "step": 42700 }, { "epoch": 6.36058981233244, "grad_norm": 0.1897784173488617, "learning_rate": 1.7603623579679217e-05, "loss": 0.7806, "num_input_tokens_seen": 24630920, "step": 42705 }, { "epoch": 6.361334524873399, "grad_norm": 0.2037503868341446, "learning_rate": 1.759741592146478e-05, "loss": 0.8232, "num_input_tokens_seen": 24633800, "step": 42710 }, { "epoch": 6.362079237414358, "grad_norm": 0.23602870106697083, "learning_rate": 1.7591208763486883e-05, "loss": 0.8154, "num_input_tokens_seen": 24636744, "step": 42715 }, { "epoch": 6.362823949955318, "grad_norm": 0.24476806819438934, "learning_rate": 1.7585002106164976e-05, "loss": 0.7918, "num_input_tokens_seen": 24639528, "step": 42720 }, { "epoch": 6.363568662496276, "grad_norm": 0.19285772740840912, "learning_rate": 1.757879594991848e-05, "loss": 0.8208, "num_input_tokens_seen": 24642408, "step": 42725 }, { "epoch": 6.364313375037236, "grad_norm": 0.2100595086812973, "learning_rate": 1.757259029516678e-05, "loss": 0.7923, "num_input_tokens_seen": 24645224, "step": 42730 }, { "epoch": 6.365058087578195, "grad_norm": 0.15812204778194427, "learning_rate": 1.7566385142329227e-05, "loss": 0.8383, "num_input_tokens_seen": 24648040, "step": 42735 }, { "epoch": 6.365802800119154, "grad_norm": 0.20012980699539185, "learning_rate": 1.7560180491825144e-05, "loss": 0.7808, "num_input_tokens_seen": 24651048, "step": 42740 }, { "epoch": 6.366547512660113, "grad_norm": 0.25502604246139526, "learning_rate": 1.7553976344073815e-05, "loss": 0.7957, "num_input_tokens_seen": 24653800, "step": 42745 }, { "epoch": 6.367292225201073, "grad_norm": 0.23422887921333313, "learning_rate": 1.7547772699494494e-05, "loss": 0.8269, "num_input_tokens_seen": 24656744, "step": 42750 }, { "epoch": 6.368036937742032, "grad_norm": 0.22397421300411224, "learning_rate": 1.7541569558506393e-05, "loss": 0.819, "num_input_tokens_seen": 24659432, "step": 42755 }, { "epoch": 6.36878165028299, "grad_norm": 0.1834346055984497, "learning_rate": 1.7535366921528707e-05, "loss": 0.8056, "num_input_tokens_seen": 24662536, "step": 42760 }, { "epoch": 6.36952636282395, "grad_norm": 0.2468140572309494, "learning_rate": 1.752916478898056e-05, "loss": 0.7799, "num_input_tokens_seen": 24665320, "step": 42765 }, { "epoch": 6.370271075364909, "grad_norm": 0.19088882207870483, "learning_rate": 1.7522963161281094e-05, "loss": 0.809, "num_input_tokens_seen": 24668168, "step": 42770 }, { "epoch": 6.371015787905868, "grad_norm": 0.2183600217103958, "learning_rate": 1.751676203884937e-05, "loss": 0.8178, "num_input_tokens_seen": 24670824, "step": 42775 }, { "epoch": 6.371760500446827, "grad_norm": 0.19879190623760223, "learning_rate": 1.7510561422104444e-05, "loss": 0.821, "num_input_tokens_seen": 24673448, "step": 42780 }, { "epoch": 6.372505212987787, "grad_norm": 0.23723164200782776, "learning_rate": 1.7504361311465332e-05, "loss": 0.7988, "num_input_tokens_seen": 24676200, "step": 42785 }, { "epoch": 6.373249925528746, "grad_norm": 0.21429389715194702, "learning_rate": 1.7498161707351005e-05, "loss": 0.7846, "num_input_tokens_seen": 24679208, "step": 42790 }, { "epoch": 6.373994638069705, "grad_norm": 0.1555676907300949, "learning_rate": 1.7491962610180408e-05, "loss": 0.8006, "num_input_tokens_seen": 24681896, "step": 42795 }, { "epoch": 6.374739350610664, "grad_norm": 0.1652889847755432, "learning_rate": 1.748576402037246e-05, "loss": 0.8405, "num_input_tokens_seen": 24684904, "step": 42800 }, { "epoch": 6.375484063151624, "grad_norm": 0.19826221466064453, "learning_rate": 1.7479565938346025e-05, "loss": 0.8314, "num_input_tokens_seen": 24687752, "step": 42805 }, { "epoch": 6.376228775692582, "grad_norm": 0.2681386470794678, "learning_rate": 1.7473368364519945e-05, "loss": 0.8177, "num_input_tokens_seen": 24690344, "step": 42810 }, { "epoch": 6.376973488233542, "grad_norm": 0.2694317102432251, "learning_rate": 1.746717129931303e-05, "loss": 0.8035, "num_input_tokens_seen": 24693320, "step": 42815 }, { "epoch": 6.377718200774501, "grad_norm": 0.19630011916160583, "learning_rate": 1.7460974743144055e-05, "loss": 0.8021, "num_input_tokens_seen": 24696200, "step": 42820 }, { "epoch": 6.3784629133154604, "grad_norm": 0.2014302909374237, "learning_rate": 1.7454778696431747e-05, "loss": 0.7956, "num_input_tokens_seen": 24699240, "step": 42825 }, { "epoch": 6.379207625856419, "grad_norm": 0.20309112966060638, "learning_rate": 1.7448583159594822e-05, "loss": 0.7929, "num_input_tokens_seen": 24702120, "step": 42830 }, { "epoch": 6.379952338397379, "grad_norm": 0.26095885038375854, "learning_rate": 1.744238813305195e-05, "loss": 0.8423, "num_input_tokens_seen": 24705096, "step": 42835 }, { "epoch": 6.380697050938338, "grad_norm": 0.26565879583358765, "learning_rate": 1.743619361722177e-05, "loss": 0.8356, "num_input_tokens_seen": 24708008, "step": 42840 }, { "epoch": 6.381441763479297, "grad_norm": 0.2103959172964096, "learning_rate": 1.7429999612522858e-05, "loss": 0.7863, "num_input_tokens_seen": 24711368, "step": 42845 }, { "epoch": 6.382186476020256, "grad_norm": 0.1898011863231659, "learning_rate": 1.7423806119373794e-05, "loss": 0.7745, "num_input_tokens_seen": 24714056, "step": 42850 }, { "epoch": 6.382931188561216, "grad_norm": 0.26690736413002014, "learning_rate": 1.7417613138193117e-05, "loss": 0.8159, "num_input_tokens_seen": 24716904, "step": 42855 }, { "epoch": 6.383675901102174, "grad_norm": 0.18861213326454163, "learning_rate": 1.7411420669399315e-05, "loss": 0.7878, "num_input_tokens_seen": 24719624, "step": 42860 }, { "epoch": 6.384420613643134, "grad_norm": 0.1850603073835373, "learning_rate": 1.740522871341085e-05, "loss": 0.7811, "num_input_tokens_seen": 24722760, "step": 42865 }, { "epoch": 6.385165326184093, "grad_norm": 0.1743372082710266, "learning_rate": 1.739903727064615e-05, "loss": 0.7993, "num_input_tokens_seen": 24725576, "step": 42870 }, { "epoch": 6.3859100387250525, "grad_norm": 0.23005250096321106, "learning_rate": 1.7392846341523606e-05, "loss": 0.7758, "num_input_tokens_seen": 24728392, "step": 42875 }, { "epoch": 6.386654751266011, "grad_norm": 0.15621419250965118, "learning_rate": 1.7386655926461586e-05, "loss": 0.8129, "num_input_tokens_seen": 24731016, "step": 42880 }, { "epoch": 6.387399463806971, "grad_norm": 0.229890838265419, "learning_rate": 1.73804660258784e-05, "loss": 0.8061, "num_input_tokens_seen": 24733928, "step": 42885 }, { "epoch": 6.38814417634793, "grad_norm": 0.12814760208129883, "learning_rate": 1.737427664019234e-05, "loss": 0.8193, "num_input_tokens_seen": 24737096, "step": 42890 }, { "epoch": 6.388888888888889, "grad_norm": 0.23650318384170532, "learning_rate": 1.736808776982166e-05, "loss": 0.8247, "num_input_tokens_seen": 24739624, "step": 42895 }, { "epoch": 6.389633601429848, "grad_norm": 0.21317528188228607, "learning_rate": 1.7361899415184584e-05, "loss": 0.8029, "num_input_tokens_seen": 24742664, "step": 42900 }, { "epoch": 6.390378313970807, "grad_norm": 0.1864348202943802, "learning_rate": 1.7355711576699286e-05, "loss": 0.7866, "num_input_tokens_seen": 24745544, "step": 42905 }, { "epoch": 6.3911230265117664, "grad_norm": 0.2130715250968933, "learning_rate": 1.734952425478392e-05, "loss": 0.7863, "num_input_tokens_seen": 24748200, "step": 42910 }, { "epoch": 6.391867739052726, "grad_norm": 0.23667190968990326, "learning_rate": 1.7343337449856605e-05, "loss": 0.7904, "num_input_tokens_seen": 24751112, "step": 42915 }, { "epoch": 6.392612451593685, "grad_norm": 0.16892535984516144, "learning_rate": 1.733715116233542e-05, "loss": 0.793, "num_input_tokens_seen": 24754216, "step": 42920 }, { "epoch": 6.393357164134644, "grad_norm": 0.2358270287513733, "learning_rate": 1.7330965392638394e-05, "loss": 0.795, "num_input_tokens_seen": 24757160, "step": 42925 }, { "epoch": 6.394101876675603, "grad_norm": 0.24578776955604553, "learning_rate": 1.732478014118355e-05, "loss": 0.8195, "num_input_tokens_seen": 24760104, "step": 42930 }, { "epoch": 6.394846589216562, "grad_norm": 0.25503525137901306, "learning_rate": 1.7318595408388862e-05, "loss": 0.8243, "num_input_tokens_seen": 24762824, "step": 42935 }, { "epoch": 6.395591301757522, "grad_norm": 0.2405412644147873, "learning_rate": 1.7312411194672258e-05, "loss": 0.7876, "num_input_tokens_seen": 24765576, "step": 42940 }, { "epoch": 6.39633601429848, "grad_norm": 0.185344859957695, "learning_rate": 1.7306227500451654e-05, "loss": 0.8219, "num_input_tokens_seen": 24768328, "step": 42945 }, { "epoch": 6.39708072683944, "grad_norm": 0.2883770763874054, "learning_rate": 1.7300044326144918e-05, "loss": 0.8059, "num_input_tokens_seen": 24771528, "step": 42950 }, { "epoch": 6.397825439380399, "grad_norm": 0.23939229547977448, "learning_rate": 1.7293861672169874e-05, "loss": 0.8041, "num_input_tokens_seen": 24774216, "step": 42955 }, { "epoch": 6.3985701519213585, "grad_norm": 0.18409691751003265, "learning_rate": 1.728767953894434e-05, "loss": 0.7802, "num_input_tokens_seen": 24777096, "step": 42960 }, { "epoch": 6.399314864462317, "grad_norm": 0.21434815227985382, "learning_rate": 1.728149792688606e-05, "loss": 0.8228, "num_input_tokens_seen": 24779848, "step": 42965 }, { "epoch": 6.400059577003277, "grad_norm": 0.16922026872634888, "learning_rate": 1.7275316836412768e-05, "loss": 0.807, "num_input_tokens_seen": 24782856, "step": 42970 }, { "epoch": 6.400804289544236, "grad_norm": 0.1869237869977951, "learning_rate": 1.7269136267942155e-05, "loss": 0.7827, "num_input_tokens_seen": 24785544, "step": 42975 }, { "epoch": 6.401549002085195, "grad_norm": 0.17992767691612244, "learning_rate": 1.7262956221891882e-05, "loss": 0.7879, "num_input_tokens_seen": 24788168, "step": 42980 }, { "epoch": 6.402293714626154, "grad_norm": 0.26751747727394104, "learning_rate": 1.7256776698679577e-05, "loss": 0.8254, "num_input_tokens_seen": 24790952, "step": 42985 }, { "epoch": 6.403038427167114, "grad_norm": 0.25580573081970215, "learning_rate": 1.7250597698722813e-05, "loss": 0.7859, "num_input_tokens_seen": 24793800, "step": 42990 }, { "epoch": 6.4037831397080724, "grad_norm": 0.22355221211910248, "learning_rate": 1.7244419222439152e-05, "loss": 0.7865, "num_input_tokens_seen": 24796840, "step": 42995 }, { "epoch": 6.404527852249032, "grad_norm": 0.13445818424224854, "learning_rate": 1.723824127024612e-05, "loss": 0.7749, "num_input_tokens_seen": 24799720, "step": 43000 }, { "epoch": 6.405272564789991, "grad_norm": 0.2519863545894623, "learning_rate": 1.723206384256118e-05, "loss": 0.7821, "num_input_tokens_seen": 24802728, "step": 43005 }, { "epoch": 6.4060172773309505, "grad_norm": 0.22233277559280396, "learning_rate": 1.7225886939801773e-05, "loss": 0.7782, "num_input_tokens_seen": 24805576, "step": 43010 }, { "epoch": 6.406761989871909, "grad_norm": 0.18662869930267334, "learning_rate": 1.7219710562385324e-05, "loss": 0.7961, "num_input_tokens_seen": 24808328, "step": 43015 }, { "epoch": 6.407506702412869, "grad_norm": 0.14726325869560242, "learning_rate": 1.7213534710729205e-05, "loss": 0.7931, "num_input_tokens_seen": 24811208, "step": 43020 }, { "epoch": 6.408251414953828, "grad_norm": 0.1780375838279724, "learning_rate": 1.7207359385250756e-05, "loss": 0.8263, "num_input_tokens_seen": 24813896, "step": 43025 }, { "epoch": 6.408996127494787, "grad_norm": 0.2337629795074463, "learning_rate": 1.7201184586367272e-05, "loss": 0.7803, "num_input_tokens_seen": 24816744, "step": 43030 }, { "epoch": 6.409740840035746, "grad_norm": 0.2847491204738617, "learning_rate": 1.7195010314496023e-05, "loss": 0.831, "num_input_tokens_seen": 24819624, "step": 43035 }, { "epoch": 6.410485552576706, "grad_norm": 0.22468985617160797, "learning_rate": 1.718883657005426e-05, "loss": 0.8487, "num_input_tokens_seen": 24822472, "step": 43040 }, { "epoch": 6.4112302651176645, "grad_norm": 0.1833522766828537, "learning_rate": 1.718266335345915e-05, "loss": 0.8095, "num_input_tokens_seen": 24825192, "step": 43045 }, { "epoch": 6.411974977658624, "grad_norm": 0.20840206742286682, "learning_rate": 1.7176490665127868e-05, "loss": 0.798, "num_input_tokens_seen": 24827848, "step": 43050 }, { "epoch": 6.412719690199583, "grad_norm": 0.19273319840431213, "learning_rate": 1.7170318505477543e-05, "loss": 0.7786, "num_input_tokens_seen": 24830920, "step": 43055 }, { "epoch": 6.4134644027405425, "grad_norm": 0.2279045432806015, "learning_rate": 1.7164146874925254e-05, "loss": 0.8096, "num_input_tokens_seen": 24833672, "step": 43060 }, { "epoch": 6.414209115281501, "grad_norm": 0.19524285197257996, "learning_rate": 1.715797577388807e-05, "loss": 0.802, "num_input_tokens_seen": 24836584, "step": 43065 }, { "epoch": 6.414953827822461, "grad_norm": 0.21918661892414093, "learning_rate": 1.715180520278299e-05, "loss": 0.8163, "num_input_tokens_seen": 24839336, "step": 43070 }, { "epoch": 6.41569854036342, "grad_norm": 0.1911303997039795, "learning_rate": 1.7145635162027008e-05, "loss": 0.7743, "num_input_tokens_seen": 24842024, "step": 43075 }, { "epoch": 6.416443252904379, "grad_norm": 0.3381752073764801, "learning_rate": 1.7139465652037077e-05, "loss": 0.818, "num_input_tokens_seen": 24844936, "step": 43080 }, { "epoch": 6.417187965445338, "grad_norm": 0.19805890321731567, "learning_rate": 1.7133296673230097e-05, "loss": 0.7997, "num_input_tokens_seen": 24847560, "step": 43085 }, { "epoch": 6.417932677986297, "grad_norm": 0.5024358034133911, "learning_rate": 1.7127128226022936e-05, "loss": 0.8064, "num_input_tokens_seen": 24850760, "step": 43090 }, { "epoch": 6.4186773905272565, "grad_norm": 0.20916961133480072, "learning_rate": 1.7120960310832446e-05, "loss": 0.796, "num_input_tokens_seen": 24853640, "step": 43095 }, { "epoch": 6.419422103068215, "grad_norm": 0.20277142524719238, "learning_rate": 1.7114792928075422e-05, "loss": 0.7905, "num_input_tokens_seen": 24856456, "step": 43100 }, { "epoch": 6.420166815609175, "grad_norm": 0.1614348441362381, "learning_rate": 1.7108626078168634e-05, "loss": 0.7798, "num_input_tokens_seen": 24859208, "step": 43105 }, { "epoch": 6.420911528150134, "grad_norm": 0.23449863493442535, "learning_rate": 1.7102459761528812e-05, "loss": 0.8185, "num_input_tokens_seen": 24862440, "step": 43110 }, { "epoch": 6.421656240691093, "grad_norm": 0.2961275279521942, "learning_rate": 1.709629397857265e-05, "loss": 0.8298, "num_input_tokens_seen": 24865320, "step": 43115 }, { "epoch": 6.422400953232052, "grad_norm": 0.1529615819454193, "learning_rate": 1.7090128729716815e-05, "loss": 0.8059, "num_input_tokens_seen": 24868168, "step": 43120 }, { "epoch": 6.423145665773012, "grad_norm": 0.2786896526813507, "learning_rate": 1.708396401537791e-05, "loss": 0.8042, "num_input_tokens_seen": 24871208, "step": 43125 }, { "epoch": 6.4238903783139705, "grad_norm": 0.17248840630054474, "learning_rate": 1.707779983597253e-05, "loss": 0.8121, "num_input_tokens_seen": 24874312, "step": 43130 }, { "epoch": 6.42463509085493, "grad_norm": 0.2833543121814728, "learning_rate": 1.7071636191917238e-05, "loss": 0.8086, "num_input_tokens_seen": 24877192, "step": 43135 }, { "epoch": 6.425379803395889, "grad_norm": 0.19926553964614868, "learning_rate": 1.706547308362853e-05, "loss": 0.8023, "num_input_tokens_seen": 24880040, "step": 43140 }, { "epoch": 6.4261245159368485, "grad_norm": 0.24567818641662598, "learning_rate": 1.705931051152289e-05, "loss": 0.7626, "num_input_tokens_seen": 24882760, "step": 43145 }, { "epoch": 6.426869228477807, "grad_norm": 0.19690628349781036, "learning_rate": 1.7053148476016774e-05, "loss": 0.8109, "num_input_tokens_seen": 24885480, "step": 43150 }, { "epoch": 6.427613941018767, "grad_norm": 0.24918514490127563, "learning_rate": 1.704698697752656e-05, "loss": 0.7996, "num_input_tokens_seen": 24888648, "step": 43155 }, { "epoch": 6.428358653559726, "grad_norm": 0.2264561802148819, "learning_rate": 1.7040826016468637e-05, "loss": 0.8102, "num_input_tokens_seen": 24891496, "step": 43160 }, { "epoch": 6.429103366100685, "grad_norm": 0.21274268627166748, "learning_rate": 1.7034665593259338e-05, "loss": 0.8136, "num_input_tokens_seen": 24894280, "step": 43165 }, { "epoch": 6.429848078641644, "grad_norm": 0.3431363105773926, "learning_rate": 1.7028505708314953e-05, "loss": 0.7985, "num_input_tokens_seen": 24897096, "step": 43170 }, { "epoch": 6.430592791182604, "grad_norm": 0.202580064535141, "learning_rate": 1.702234636205173e-05, "loss": 0.8218, "num_input_tokens_seen": 24899880, "step": 43175 }, { "epoch": 6.4313375037235625, "grad_norm": 0.24502460658550262, "learning_rate": 1.7016187554885916e-05, "loss": 0.8096, "num_input_tokens_seen": 24902984, "step": 43180 }, { "epoch": 6.432082216264522, "grad_norm": 0.22443148493766785, "learning_rate": 1.7010029287233688e-05, "loss": 0.8166, "num_input_tokens_seen": 24905736, "step": 43185 }, { "epoch": 6.432826928805481, "grad_norm": 0.18685348331928253, "learning_rate": 1.7003871559511187e-05, "loss": 0.7857, "num_input_tokens_seen": 24908328, "step": 43190 }, { "epoch": 6.4335716413464406, "grad_norm": 0.2323155701160431, "learning_rate": 1.6997714372134544e-05, "loss": 0.7899, "num_input_tokens_seen": 24911624, "step": 43195 }, { "epoch": 6.434316353887399, "grad_norm": 0.21622294187545776, "learning_rate": 1.6991557725519824e-05, "loss": 0.8291, "num_input_tokens_seen": 24914376, "step": 43200 }, { "epoch": 6.435061066428359, "grad_norm": 0.2008792757987976, "learning_rate": 1.698540162008308e-05, "loss": 0.7763, "num_input_tokens_seen": 24917224, "step": 43205 }, { "epoch": 6.435805778969318, "grad_norm": 0.1533787101507187, "learning_rate": 1.6979246056240305e-05, "loss": 0.7824, "num_input_tokens_seen": 24919944, "step": 43210 }, { "epoch": 6.436550491510277, "grad_norm": 0.18752312660217285, "learning_rate": 1.6973091034407468e-05, "loss": 0.8146, "num_input_tokens_seen": 24922536, "step": 43215 }, { "epoch": 6.437295204051236, "grad_norm": 0.23113518953323364, "learning_rate": 1.6966936555000507e-05, "loss": 0.8377, "num_input_tokens_seen": 24925640, "step": 43220 }, { "epoch": 6.438039916592196, "grad_norm": 0.24535098671913147, "learning_rate": 1.6960782618435312e-05, "loss": 0.8208, "num_input_tokens_seen": 24928616, "step": 43225 }, { "epoch": 6.4387846291331545, "grad_norm": 0.2603253722190857, "learning_rate": 1.6954629225127745e-05, "loss": 0.8096, "num_input_tokens_seen": 24931496, "step": 43230 }, { "epoch": 6.439529341674114, "grad_norm": 0.15786674618721008, "learning_rate": 1.6948476375493622e-05, "loss": 0.784, "num_input_tokens_seen": 24934344, "step": 43235 }, { "epoch": 6.440274054215073, "grad_norm": 0.20114947855472565, "learning_rate": 1.694232406994873e-05, "loss": 0.8191, "num_input_tokens_seen": 24937192, "step": 43240 }, { "epoch": 6.441018766756033, "grad_norm": 0.26435139775276184, "learning_rate": 1.6936172308908825e-05, "loss": 0.8154, "num_input_tokens_seen": 24939976, "step": 43245 }, { "epoch": 6.441763479296991, "grad_norm": 0.30380499362945557, "learning_rate": 1.693002109278961e-05, "loss": 0.8087, "num_input_tokens_seen": 24942760, "step": 43250 }, { "epoch": 6.44250819183795, "grad_norm": 0.20618997514247894, "learning_rate": 1.6923870422006753e-05, "loss": 0.8075, "num_input_tokens_seen": 24945864, "step": 43255 }, { "epoch": 6.44325290437891, "grad_norm": 0.19884714484214783, "learning_rate": 1.6917720296975898e-05, "loss": 0.8033, "num_input_tokens_seen": 24948744, "step": 43260 }, { "epoch": 6.443997616919869, "grad_norm": 0.18316572904586792, "learning_rate": 1.6911570718112646e-05, "loss": 0.7945, "num_input_tokens_seen": 24951592, "step": 43265 }, { "epoch": 6.444742329460828, "grad_norm": 0.177278533577919, "learning_rate": 1.6905421685832555e-05, "loss": 0.7967, "num_input_tokens_seen": 24954344, "step": 43270 }, { "epoch": 6.445487042001787, "grad_norm": 0.20166748762130737, "learning_rate": 1.689927320055116e-05, "loss": 0.8135, "num_input_tokens_seen": 24957032, "step": 43275 }, { "epoch": 6.4462317545427466, "grad_norm": 0.17430785298347473, "learning_rate": 1.6893125262683952e-05, "loss": 0.8271, "num_input_tokens_seen": 24959976, "step": 43280 }, { "epoch": 6.446976467083705, "grad_norm": 0.25704464316368103, "learning_rate": 1.688697787264638e-05, "loss": 0.7887, "num_input_tokens_seen": 24962920, "step": 43285 }, { "epoch": 6.447721179624665, "grad_norm": 0.23951086401939392, "learning_rate": 1.6880831030853854e-05, "loss": 0.807, "num_input_tokens_seen": 24965896, "step": 43290 }, { "epoch": 6.448465892165624, "grad_norm": 0.2220023274421692, "learning_rate": 1.6874684737721752e-05, "loss": 0.8194, "num_input_tokens_seen": 24968648, "step": 43295 }, { "epoch": 6.449210604706583, "grad_norm": 0.21088679134845734, "learning_rate": 1.6868538993665426e-05, "loss": 0.8001, "num_input_tokens_seen": 24971464, "step": 43300 }, { "epoch": 6.449955317247542, "grad_norm": 0.24069328606128693, "learning_rate": 1.6862393799100166e-05, "loss": 0.7988, "num_input_tokens_seen": 24974280, "step": 43305 }, { "epoch": 6.450700029788502, "grad_norm": 0.21946030855178833, "learning_rate": 1.6856249154441256e-05, "loss": 0.7977, "num_input_tokens_seen": 24977032, "step": 43310 }, { "epoch": 6.4514447423294605, "grad_norm": 0.3028658926486969, "learning_rate": 1.685010506010392e-05, "loss": 0.7966, "num_input_tokens_seen": 24980168, "step": 43315 }, { "epoch": 6.45218945487042, "grad_norm": 0.2101394683122635, "learning_rate": 1.6843961516503344e-05, "loss": 0.8185, "num_input_tokens_seen": 24983240, "step": 43320 }, { "epoch": 6.452934167411379, "grad_norm": 0.1634373664855957, "learning_rate": 1.6837818524054696e-05, "loss": 0.7977, "num_input_tokens_seen": 24985896, "step": 43325 }, { "epoch": 6.453678879952339, "grad_norm": 0.2675904333591461, "learning_rate": 1.683167608317308e-05, "loss": 0.7845, "num_input_tokens_seen": 24989032, "step": 43330 }, { "epoch": 6.454423592493297, "grad_norm": 0.1915777176618576, "learning_rate": 1.6825534194273586e-05, "loss": 0.8047, "num_input_tokens_seen": 24992104, "step": 43335 }, { "epoch": 6.455168305034257, "grad_norm": 0.1798284500837326, "learning_rate": 1.6819392857771253e-05, "loss": 0.7713, "num_input_tokens_seen": 24994792, "step": 43340 }, { "epoch": 6.455913017575216, "grad_norm": 0.265449583530426, "learning_rate": 1.6813252074081094e-05, "loss": 0.7969, "num_input_tokens_seen": 24997768, "step": 43345 }, { "epoch": 6.456657730116175, "grad_norm": 0.21960751712322235, "learning_rate": 1.6807111843618077e-05, "loss": 0.7716, "num_input_tokens_seen": 25000552, "step": 43350 }, { "epoch": 6.457402442657134, "grad_norm": 0.15983134508132935, "learning_rate": 1.6800972166797126e-05, "loss": 0.7903, "num_input_tokens_seen": 25003176, "step": 43355 }, { "epoch": 6.458147155198094, "grad_norm": 0.1879955232143402, "learning_rate": 1.6794833044033147e-05, "loss": 0.7829, "num_input_tokens_seen": 25005832, "step": 43360 }, { "epoch": 6.4588918677390526, "grad_norm": 0.18286339938640594, "learning_rate": 1.678869447574099e-05, "loss": 0.814, "num_input_tokens_seen": 25008552, "step": 43365 }, { "epoch": 6.459636580280012, "grad_norm": 0.1907084584236145, "learning_rate": 1.678255646233548e-05, "loss": 0.8043, "num_input_tokens_seen": 25011368, "step": 43370 }, { "epoch": 6.460381292820971, "grad_norm": 0.17042003571987152, "learning_rate": 1.6776419004231386e-05, "loss": 0.7738, "num_input_tokens_seen": 25014120, "step": 43375 }, { "epoch": 6.461126005361931, "grad_norm": 0.17795436084270477, "learning_rate": 1.677028210184346e-05, "loss": 0.7666, "num_input_tokens_seen": 25016968, "step": 43380 }, { "epoch": 6.461870717902889, "grad_norm": 0.268608421087265, "learning_rate": 1.6764145755586417e-05, "loss": 0.8236, "num_input_tokens_seen": 25019720, "step": 43385 }, { "epoch": 6.462615430443849, "grad_norm": 0.19022375345230103, "learning_rate": 1.675800996587491e-05, "loss": 0.7634, "num_input_tokens_seen": 25022408, "step": 43390 }, { "epoch": 6.463360142984808, "grad_norm": 0.22807586193084717, "learning_rate": 1.675187473312359e-05, "loss": 0.8031, "num_input_tokens_seen": 25025416, "step": 43395 }, { "epoch": 6.464104855525767, "grad_norm": 0.1693851500749588, "learning_rate": 1.6745740057747038e-05, "loss": 0.7891, "num_input_tokens_seen": 25028232, "step": 43400 }, { "epoch": 6.464849568066726, "grad_norm": 0.1828499585390091, "learning_rate": 1.673960594015982e-05, "loss": 0.7937, "num_input_tokens_seen": 25031240, "step": 43405 }, { "epoch": 6.465594280607686, "grad_norm": 0.15779677033424377, "learning_rate": 1.673347238077644e-05, "loss": 0.8124, "num_input_tokens_seen": 25034152, "step": 43410 }, { "epoch": 6.466338993148645, "grad_norm": 0.2938763201236725, "learning_rate": 1.6727339380011386e-05, "loss": 0.8293, "num_input_tokens_seen": 25036744, "step": 43415 }, { "epoch": 6.467083705689604, "grad_norm": 0.22378945350646973, "learning_rate": 1.6721206938279105e-05, "loss": 0.7665, "num_input_tokens_seen": 25039336, "step": 43420 }, { "epoch": 6.467828418230563, "grad_norm": 0.1950903981924057, "learning_rate": 1.6715075055993994e-05, "loss": 0.7807, "num_input_tokens_seen": 25042472, "step": 43425 }, { "epoch": 6.468573130771523, "grad_norm": 0.17782731354236603, "learning_rate": 1.6708943733570437e-05, "loss": 0.7826, "num_input_tokens_seen": 25045704, "step": 43430 }, { "epoch": 6.469317843312481, "grad_norm": 0.1876024305820465, "learning_rate": 1.6702812971422746e-05, "loss": 0.8049, "num_input_tokens_seen": 25048968, "step": 43435 }, { "epoch": 6.47006255585344, "grad_norm": 0.20351524651050568, "learning_rate": 1.669668276996522e-05, "loss": 0.7805, "num_input_tokens_seen": 25051880, "step": 43440 }, { "epoch": 6.4708072683944, "grad_norm": 0.20911556482315063, "learning_rate": 1.6690553129612125e-05, "loss": 0.837, "num_input_tokens_seen": 25054728, "step": 43445 }, { "epoch": 6.4715519809353586, "grad_norm": 0.1627907007932663, "learning_rate": 1.668442405077766e-05, "loss": 0.8353, "num_input_tokens_seen": 25057640, "step": 43450 }, { "epoch": 6.472296693476318, "grad_norm": 0.20387551188468933, "learning_rate": 1.6678295533876006e-05, "loss": 0.8199, "num_input_tokens_seen": 25060296, "step": 43455 }, { "epoch": 6.473041406017277, "grad_norm": 0.21071337163448334, "learning_rate": 1.6672167579321305e-05, "loss": 0.8143, "num_input_tokens_seen": 25063176, "step": 43460 }, { "epoch": 6.473786118558237, "grad_norm": 0.1908666491508484, "learning_rate": 1.6666040187527665e-05, "loss": 0.7968, "num_input_tokens_seen": 25065736, "step": 43465 }, { "epoch": 6.474530831099195, "grad_norm": 0.193235382437706, "learning_rate": 1.665991335890914e-05, "loss": 0.7727, "num_input_tokens_seen": 25068520, "step": 43470 }, { "epoch": 6.475275543640155, "grad_norm": 0.26230186223983765, "learning_rate": 1.6653787093879762e-05, "loss": 0.8045, "num_input_tokens_seen": 25071176, "step": 43475 }, { "epoch": 6.476020256181114, "grad_norm": 0.15252730250358582, "learning_rate": 1.6647661392853525e-05, "loss": 0.7462, "num_input_tokens_seen": 25074120, "step": 43480 }, { "epoch": 6.476764968722073, "grad_norm": 0.19474536180496216, "learning_rate": 1.664153625624438e-05, "loss": 0.7952, "num_input_tokens_seen": 25076904, "step": 43485 }, { "epoch": 6.477509681263032, "grad_norm": 0.21380649507045746, "learning_rate": 1.6635411684466217e-05, "loss": 0.8057, "num_input_tokens_seen": 25080104, "step": 43490 }, { "epoch": 6.478254393803992, "grad_norm": 0.23551122844219208, "learning_rate": 1.6629287677932924e-05, "loss": 0.8163, "num_input_tokens_seen": 25083144, "step": 43495 }, { "epoch": 6.478999106344951, "grad_norm": 0.30657562613487244, "learning_rate": 1.6623164237058347e-05, "loss": 0.7924, "num_input_tokens_seen": 25086024, "step": 43500 }, { "epoch": 6.47974381888591, "grad_norm": 0.1993391513824463, "learning_rate": 1.6617041362256265e-05, "loss": 0.7832, "num_input_tokens_seen": 25088680, "step": 43505 }, { "epoch": 6.480488531426869, "grad_norm": 0.22376620769500732, "learning_rate": 1.6610919053940446e-05, "loss": 0.7771, "num_input_tokens_seen": 25091688, "step": 43510 }, { "epoch": 6.481233243967829, "grad_norm": 0.24026769399642944, "learning_rate": 1.6604797312524613e-05, "loss": 0.8015, "num_input_tokens_seen": 25094440, "step": 43515 }, { "epoch": 6.481977956508787, "grad_norm": 0.26466816663742065, "learning_rate": 1.659867613842244e-05, "loss": 0.8076, "num_input_tokens_seen": 25097288, "step": 43520 }, { "epoch": 6.482722669049747, "grad_norm": 0.15556152164936066, "learning_rate": 1.6592555532047592e-05, "loss": 0.8289, "num_input_tokens_seen": 25100264, "step": 43525 }, { "epoch": 6.483467381590706, "grad_norm": 0.2845221161842346, "learning_rate": 1.6586435493813645e-05, "loss": 0.8131, "num_input_tokens_seen": 25102984, "step": 43530 }, { "epoch": 6.484212094131665, "grad_norm": 0.19732044637203217, "learning_rate": 1.6580316024134186e-05, "loss": 0.7659, "num_input_tokens_seen": 25105768, "step": 43535 }, { "epoch": 6.484956806672624, "grad_norm": 0.1605760157108307, "learning_rate": 1.657419712342273e-05, "loss": 0.7841, "num_input_tokens_seen": 25108488, "step": 43540 }, { "epoch": 6.485701519213584, "grad_norm": 0.2456578016281128, "learning_rate": 1.656807879209278e-05, "loss": 0.7925, "num_input_tokens_seen": 25111336, "step": 43545 }, { "epoch": 6.486446231754543, "grad_norm": 0.2384372502565384, "learning_rate": 1.656196103055779e-05, "loss": 0.7757, "num_input_tokens_seen": 25114248, "step": 43550 }, { "epoch": 6.487190944295502, "grad_norm": 0.25992098450660706, "learning_rate": 1.6555843839231156e-05, "loss": 0.8288, "num_input_tokens_seen": 25117000, "step": 43555 }, { "epoch": 6.487935656836461, "grad_norm": 0.19291682541370392, "learning_rate": 1.654972721852627e-05, "loss": 0.8249, "num_input_tokens_seen": 25119848, "step": 43560 }, { "epoch": 6.488680369377421, "grad_norm": 0.24169740080833435, "learning_rate": 1.6543611168856464e-05, "loss": 0.8204, "num_input_tokens_seen": 25122600, "step": 43565 }, { "epoch": 6.489425081918379, "grad_norm": 0.1913565695285797, "learning_rate": 1.6537495690635034e-05, "loss": 0.806, "num_input_tokens_seen": 25125608, "step": 43570 }, { "epoch": 6.490169794459339, "grad_norm": 0.24221307039260864, "learning_rate": 1.6531380784275237e-05, "loss": 0.8025, "num_input_tokens_seen": 25128808, "step": 43575 }, { "epoch": 6.490914507000298, "grad_norm": 0.19327203929424286, "learning_rate": 1.6525266450190296e-05, "loss": 0.7904, "num_input_tokens_seen": 25131688, "step": 43580 }, { "epoch": 6.4916592195412575, "grad_norm": 0.15961268544197083, "learning_rate": 1.6519152688793387e-05, "loss": 0.8033, "num_input_tokens_seen": 25134408, "step": 43585 }, { "epoch": 6.492403932082216, "grad_norm": 0.2508581876754761, "learning_rate": 1.6513039500497663e-05, "loss": 0.8141, "num_input_tokens_seen": 25137608, "step": 43590 }, { "epoch": 6.493148644623176, "grad_norm": 0.183558389544487, "learning_rate": 1.6506926885716224e-05, "loss": 0.8114, "num_input_tokens_seen": 25140520, "step": 43595 }, { "epoch": 6.493893357164135, "grad_norm": 0.19071289896965027, "learning_rate": 1.6500814844862135e-05, "loss": 0.7694, "num_input_tokens_seen": 25143208, "step": 43600 }, { "epoch": 6.494638069705093, "grad_norm": 0.27474525570869446, "learning_rate": 1.6494703378348433e-05, "loss": 0.8177, "num_input_tokens_seen": 25146088, "step": 43605 }, { "epoch": 6.495382782246053, "grad_norm": 0.1887797862291336, "learning_rate": 1.6488592486588087e-05, "loss": 0.7969, "num_input_tokens_seen": 25149192, "step": 43610 }, { "epoch": 6.496127494787013, "grad_norm": 0.2667287588119507, "learning_rate": 1.6482482169994055e-05, "loss": 0.8108, "num_input_tokens_seen": 25151784, "step": 43615 }, { "epoch": 6.496872207327971, "grad_norm": 0.15425802767276764, "learning_rate": 1.6476372428979254e-05, "loss": 0.7723, "num_input_tokens_seen": 25154504, "step": 43620 }, { "epoch": 6.49761691986893, "grad_norm": 0.1581919938325882, "learning_rate": 1.6470263263956543e-05, "loss": 0.784, "num_input_tokens_seen": 25157480, "step": 43625 }, { "epoch": 6.49836163240989, "grad_norm": 0.22433947026729584, "learning_rate": 1.6464154675338767e-05, "loss": 0.792, "num_input_tokens_seen": 25160392, "step": 43630 }, { "epoch": 6.499106344950849, "grad_norm": 0.2345975935459137, "learning_rate": 1.6458046663538706e-05, "loss": 0.8128, "num_input_tokens_seen": 25163208, "step": 43635 }, { "epoch": 6.499851057491808, "grad_norm": 0.24203942716121674, "learning_rate": 1.6451939228969127e-05, "loss": 0.7966, "num_input_tokens_seen": 25166024, "step": 43640 }, { "epoch": 6.5, "eval_loss": 0.8025602698326111, "eval_runtime": 45.2683, "eval_samples_per_second": 65.918, "eval_steps_per_second": 16.48, "num_input_tokens_seen": 25166536, "step": 43641 }, { "epoch": 6.500595770032767, "grad_norm": 0.19969871640205383, "learning_rate": 1.644583237204275e-05, "loss": 0.7832, "num_input_tokens_seen": 25168680, "step": 43645 }, { "epoch": 6.501340482573727, "grad_norm": 0.26967182755470276, "learning_rate": 1.6439726093172237e-05, "loss": 0.7802, "num_input_tokens_seen": 25171816, "step": 43650 }, { "epoch": 6.502085195114685, "grad_norm": 0.24372361600399017, "learning_rate": 1.6433620392770227e-05, "loss": 0.8041, "num_input_tokens_seen": 25174728, "step": 43655 }, { "epoch": 6.502829907655645, "grad_norm": 0.1482151299715042, "learning_rate": 1.642751527124932e-05, "loss": 0.826, "num_input_tokens_seen": 25177544, "step": 43660 }, { "epoch": 6.503574620196604, "grad_norm": 0.18134184181690216, "learning_rate": 1.6421410729022087e-05, "loss": 0.8148, "num_input_tokens_seen": 25180744, "step": 43665 }, { "epoch": 6.5043193327375635, "grad_norm": 0.2754325270652771, "learning_rate": 1.641530676650103e-05, "loss": 0.8115, "num_input_tokens_seen": 25183912, "step": 43670 }, { "epoch": 6.505064045278522, "grad_norm": 0.18940319120883942, "learning_rate": 1.6409203384098637e-05, "loss": 0.7979, "num_input_tokens_seen": 25186728, "step": 43675 }, { "epoch": 6.505808757819482, "grad_norm": 0.27571171522140503, "learning_rate": 1.640310058222736e-05, "loss": 0.7728, "num_input_tokens_seen": 25189544, "step": 43680 }, { "epoch": 6.506553470360441, "grad_norm": 0.24604414403438568, "learning_rate": 1.6396998361299597e-05, "loss": 0.8163, "num_input_tokens_seen": 25192616, "step": 43685 }, { "epoch": 6.5072981829014, "grad_norm": 0.40465134382247925, "learning_rate": 1.63908967217277e-05, "loss": 0.7888, "num_input_tokens_seen": 25196008, "step": 43690 }, { "epoch": 6.508042895442359, "grad_norm": 0.207739919424057, "learning_rate": 1.6384795663924003e-05, "loss": 0.8137, "num_input_tokens_seen": 25198792, "step": 43695 }, { "epoch": 6.508787607983319, "grad_norm": 0.2222982943058014, "learning_rate": 1.6378695188300787e-05, "loss": 0.7763, "num_input_tokens_seen": 25201512, "step": 43700 }, { "epoch": 6.509532320524277, "grad_norm": 0.21965166926383972, "learning_rate": 1.6372595295270294e-05, "loss": 0.813, "num_input_tokens_seen": 25204520, "step": 43705 }, { "epoch": 6.510277033065237, "grad_norm": 0.34191617369651794, "learning_rate": 1.6366495985244736e-05, "loss": 0.8, "num_input_tokens_seen": 25207304, "step": 43710 }, { "epoch": 6.511021745606196, "grad_norm": 0.15477892756462097, "learning_rate": 1.6360397258636284e-05, "loss": 0.81, "num_input_tokens_seen": 25210152, "step": 43715 }, { "epoch": 6.5117664581471555, "grad_norm": 0.22357258200645447, "learning_rate": 1.6354299115857052e-05, "loss": 0.7847, "num_input_tokens_seen": 25213160, "step": 43720 }, { "epoch": 6.512511170688114, "grad_norm": 0.2205333262681961, "learning_rate": 1.6348201557319148e-05, "loss": 0.8054, "num_input_tokens_seen": 25216040, "step": 43725 }, { "epoch": 6.513255883229074, "grad_norm": 0.24182027578353882, "learning_rate": 1.6342104583434595e-05, "loss": 0.7904, "num_input_tokens_seen": 25218920, "step": 43730 }, { "epoch": 6.514000595770033, "grad_norm": 0.2247457355260849, "learning_rate": 1.633600819461542e-05, "loss": 0.8249, "num_input_tokens_seen": 25221704, "step": 43735 }, { "epoch": 6.514745308310992, "grad_norm": 0.23451244831085205, "learning_rate": 1.632991239127358e-05, "loss": 0.848, "num_input_tokens_seen": 25224552, "step": 43740 }, { "epoch": 6.515490020851951, "grad_norm": 0.22413183748722076, "learning_rate": 1.6323817173821014e-05, "loss": 0.802, "num_input_tokens_seen": 25227528, "step": 43745 }, { "epoch": 6.516234733392911, "grad_norm": 0.2932889461517334, "learning_rate": 1.6317722542669606e-05, "loss": 0.8132, "num_input_tokens_seen": 25230952, "step": 43750 }, { "epoch": 6.5169794459338695, "grad_norm": 0.203317791223526, "learning_rate": 1.6311628498231208e-05, "loss": 0.79, "num_input_tokens_seen": 25233864, "step": 43755 }, { "epoch": 6.517724158474829, "grad_norm": 0.21092773973941803, "learning_rate": 1.6305535040917638e-05, "loss": 0.8311, "num_input_tokens_seen": 25236776, "step": 43760 }, { "epoch": 6.518468871015788, "grad_norm": 0.24858230352401733, "learning_rate": 1.6299442171140656e-05, "loss": 0.7759, "num_input_tokens_seen": 25239784, "step": 43765 }, { "epoch": 6.519213583556747, "grad_norm": 0.16174420714378357, "learning_rate": 1.6293349889312007e-05, "loss": 0.8002, "num_input_tokens_seen": 25242760, "step": 43770 }, { "epoch": 6.519958296097706, "grad_norm": 0.27024152874946594, "learning_rate": 1.6287258195843363e-05, "loss": 0.8094, "num_input_tokens_seen": 25245768, "step": 43775 }, { "epoch": 6.520703008638666, "grad_norm": 0.1723165512084961, "learning_rate": 1.6281167091146392e-05, "loss": 0.7924, "num_input_tokens_seen": 25248680, "step": 43780 }, { "epoch": 6.521447721179625, "grad_norm": 0.18782369792461395, "learning_rate": 1.62750765756327e-05, "loss": 0.7672, "num_input_tokens_seen": 25251304, "step": 43785 }, { "epoch": 6.522192433720583, "grad_norm": 0.19962164759635925, "learning_rate": 1.6268986649713852e-05, "loss": 0.7875, "num_input_tokens_seen": 25254088, "step": 43790 }, { "epoch": 6.522937146261543, "grad_norm": 0.18464712798595428, "learning_rate": 1.6262897313801402e-05, "loss": 0.7651, "num_input_tokens_seen": 25257576, "step": 43795 }, { "epoch": 6.523681858802503, "grad_norm": 0.1842627078294754, "learning_rate": 1.625680856830682e-05, "loss": 0.7827, "num_input_tokens_seen": 25260328, "step": 43800 }, { "epoch": 6.5244265713434615, "grad_norm": 0.21895655989646912, "learning_rate": 1.6250720413641565e-05, "loss": 0.7967, "num_input_tokens_seen": 25263496, "step": 43805 }, { "epoch": 6.52517128388442, "grad_norm": 0.26199692487716675, "learning_rate": 1.6244632850217067e-05, "loss": 0.8082, "num_input_tokens_seen": 25266152, "step": 43810 }, { "epoch": 6.52591599642538, "grad_norm": 0.17795498669147491, "learning_rate": 1.6238545878444676e-05, "loss": 0.799, "num_input_tokens_seen": 25269224, "step": 43815 }, { "epoch": 6.526660708966339, "grad_norm": 0.1752806305885315, "learning_rate": 1.623245949873573e-05, "loss": 0.7869, "num_input_tokens_seen": 25271944, "step": 43820 }, { "epoch": 6.527405421507298, "grad_norm": 0.26420775055885315, "learning_rate": 1.6226373711501523e-05, "loss": 0.8142, "num_input_tokens_seen": 25274728, "step": 43825 }, { "epoch": 6.528150134048257, "grad_norm": 0.1922360509634018, "learning_rate": 1.6220288517153318e-05, "loss": 0.8025, "num_input_tokens_seen": 25277640, "step": 43830 }, { "epoch": 6.528894846589217, "grad_norm": 0.20003242790699005, "learning_rate": 1.621420391610231e-05, "loss": 0.8183, "num_input_tokens_seen": 25280520, "step": 43835 }, { "epoch": 6.5296395591301755, "grad_norm": 0.21195124089717865, "learning_rate": 1.6208119908759684e-05, "loss": 0.7955, "num_input_tokens_seen": 25283336, "step": 43840 }, { "epoch": 6.530384271671135, "grad_norm": 0.23519498109817505, "learning_rate": 1.6202036495536575e-05, "loss": 0.8295, "num_input_tokens_seen": 25286376, "step": 43845 }, { "epoch": 6.531128984212094, "grad_norm": 0.20672303438186646, "learning_rate": 1.6195953676844072e-05, "loss": 0.7934, "num_input_tokens_seen": 25289192, "step": 43850 }, { "epoch": 6.5318736967530535, "grad_norm": 0.2693437337875366, "learning_rate": 1.6189871453093217e-05, "loss": 0.7888, "num_input_tokens_seen": 25292008, "step": 43855 }, { "epoch": 6.532618409294012, "grad_norm": 0.19329065084457397, "learning_rate": 1.6183789824695027e-05, "loss": 0.7956, "num_input_tokens_seen": 25294728, "step": 43860 }, { "epoch": 6.533363121834972, "grad_norm": 0.19889739155769348, "learning_rate": 1.6177708792060486e-05, "loss": 0.7967, "num_input_tokens_seen": 25297608, "step": 43865 }, { "epoch": 6.534107834375931, "grad_norm": 0.19812647998332977, "learning_rate": 1.6171628355600507e-05, "loss": 0.7852, "num_input_tokens_seen": 25300456, "step": 43870 }, { "epoch": 6.53485254691689, "grad_norm": 0.20116807520389557, "learning_rate": 1.6165548515725992e-05, "loss": 0.7906, "num_input_tokens_seen": 25303144, "step": 43875 }, { "epoch": 6.535597259457849, "grad_norm": 0.20421290397644043, "learning_rate": 1.6159469272847793e-05, "loss": 0.8057, "num_input_tokens_seen": 25305832, "step": 43880 }, { "epoch": 6.536341971998809, "grad_norm": 0.2196969836950302, "learning_rate": 1.6153390627376717e-05, "loss": 0.7848, "num_input_tokens_seen": 25308776, "step": 43885 }, { "epoch": 6.5370866845397675, "grad_norm": 0.2792995572090149, "learning_rate": 1.6147312579723542e-05, "loss": 0.8185, "num_input_tokens_seen": 25311720, "step": 43890 }, { "epoch": 6.537831397080727, "grad_norm": 0.24425965547561646, "learning_rate": 1.6141235130298983e-05, "loss": 0.815, "num_input_tokens_seen": 25314536, "step": 43895 }, { "epoch": 6.538576109621686, "grad_norm": 0.23929685354232788, "learning_rate": 1.6135158279513737e-05, "loss": 0.7865, "num_input_tokens_seen": 25317448, "step": 43900 }, { "epoch": 6.5393208221626455, "grad_norm": 0.45301446318626404, "learning_rate": 1.612908202777845e-05, "loss": 0.7999, "num_input_tokens_seen": 25320616, "step": 43905 }, { "epoch": 6.540065534703604, "grad_norm": 0.17048318684101105, "learning_rate": 1.6123006375503737e-05, "loss": 0.8017, "num_input_tokens_seen": 25323592, "step": 43910 }, { "epoch": 6.540810247244564, "grad_norm": 0.21296176314353943, "learning_rate": 1.6116931323100158e-05, "loss": 0.7646, "num_input_tokens_seen": 25326632, "step": 43915 }, { "epoch": 6.541554959785523, "grad_norm": 0.16690686345100403, "learning_rate": 1.6110856870978245e-05, "loss": 0.7895, "num_input_tokens_seen": 25329320, "step": 43920 }, { "epoch": 6.542299672326482, "grad_norm": 0.2711874544620514, "learning_rate": 1.6104783019548486e-05, "loss": 0.77, "num_input_tokens_seen": 25332232, "step": 43925 }, { "epoch": 6.543044384867441, "grad_norm": 0.19975101947784424, "learning_rate": 1.6098709769221333e-05, "loss": 0.7925, "num_input_tokens_seen": 25335112, "step": 43930 }, { "epoch": 6.5437890974084, "grad_norm": 0.21158508956432343, "learning_rate": 1.6092637120407174e-05, "loss": 0.8147, "num_input_tokens_seen": 25338024, "step": 43935 }, { "epoch": 6.5445338099493595, "grad_norm": 0.218624085187912, "learning_rate": 1.6086565073516385e-05, "loss": 0.8098, "num_input_tokens_seen": 25340776, "step": 43940 }, { "epoch": 6.545278522490319, "grad_norm": 0.17711305618286133, "learning_rate": 1.608049362895929e-05, "loss": 0.8281, "num_input_tokens_seen": 25343784, "step": 43945 }, { "epoch": 6.546023235031278, "grad_norm": 0.2235056310892105, "learning_rate": 1.607442278714617e-05, "loss": 0.796, "num_input_tokens_seen": 25346536, "step": 43950 }, { "epoch": 6.546767947572237, "grad_norm": 0.25239744782447815, "learning_rate": 1.6068352548487263e-05, "loss": 0.8368, "num_input_tokens_seen": 25349352, "step": 43955 }, { "epoch": 6.547512660113196, "grad_norm": 0.23736147582530975, "learning_rate": 1.606228291339279e-05, "loss": 0.8045, "num_input_tokens_seen": 25352488, "step": 43960 }, { "epoch": 6.548257372654156, "grad_norm": 0.1767783761024475, "learning_rate": 1.6056213882272892e-05, "loss": 0.829, "num_input_tokens_seen": 25355496, "step": 43965 }, { "epoch": 6.549002085195115, "grad_norm": 0.26414915919303894, "learning_rate": 1.6050145455537708e-05, "loss": 0.8327, "num_input_tokens_seen": 25358408, "step": 43970 }, { "epoch": 6.5497467977360735, "grad_norm": 0.2759600579738617, "learning_rate": 1.6044077633597292e-05, "loss": 0.7988, "num_input_tokens_seen": 25361064, "step": 43975 }, { "epoch": 6.550491510277033, "grad_norm": 0.2549905776977539, "learning_rate": 1.603801041686171e-05, "loss": 0.7986, "num_input_tokens_seen": 25363944, "step": 43980 }, { "epoch": 6.551236222817992, "grad_norm": 0.20642688870429993, "learning_rate": 1.6031943805740934e-05, "loss": 0.8098, "num_input_tokens_seen": 25366728, "step": 43985 }, { "epoch": 6.5519809353589515, "grad_norm": 0.18032683432102203, "learning_rate": 1.602587780064494e-05, "loss": 0.8043, "num_input_tokens_seen": 25369576, "step": 43990 }, { "epoch": 6.55272564789991, "grad_norm": 0.23252448439598083, "learning_rate": 1.601981240198364e-05, "loss": 0.8014, "num_input_tokens_seen": 25372200, "step": 43995 }, { "epoch": 6.55347036044087, "grad_norm": 0.18670378625392914, "learning_rate": 1.6013747610166903e-05, "loss": 0.8219, "num_input_tokens_seen": 25374888, "step": 44000 }, { "epoch": 6.554215072981829, "grad_norm": 0.2515355050563812, "learning_rate": 1.600768342560457e-05, "loss": 0.8145, "num_input_tokens_seen": 25377960, "step": 44005 }, { "epoch": 6.554959785522788, "grad_norm": 0.29538801312446594, "learning_rate": 1.6001619848706435e-05, "loss": 0.8189, "num_input_tokens_seen": 25380744, "step": 44010 }, { "epoch": 6.555704498063747, "grad_norm": 0.23846782743930817, "learning_rate": 1.5995556879882246e-05, "loss": 0.7906, "num_input_tokens_seen": 25383528, "step": 44015 }, { "epoch": 6.556449210604707, "grad_norm": 0.23147690296173096, "learning_rate": 1.5989494519541706e-05, "loss": 0.8021, "num_input_tokens_seen": 25386408, "step": 44020 }, { "epoch": 6.5571939231456655, "grad_norm": 0.19762523472309113, "learning_rate": 1.5983432768094495e-05, "loss": 0.8109, "num_input_tokens_seen": 25389128, "step": 44025 }, { "epoch": 6.557938635686625, "grad_norm": 0.26690053939819336, "learning_rate": 1.597737162595024e-05, "loss": 0.8111, "num_input_tokens_seen": 25392424, "step": 44030 }, { "epoch": 6.558683348227584, "grad_norm": 0.24817316234111786, "learning_rate": 1.5971311093518527e-05, "loss": 0.8003, "num_input_tokens_seen": 25395240, "step": 44035 }, { "epoch": 6.559428060768544, "grad_norm": 0.20837272703647614, "learning_rate": 1.5965251171208896e-05, "loss": 0.776, "num_input_tokens_seen": 25397992, "step": 44040 }, { "epoch": 6.560172773309502, "grad_norm": 0.20637483894824982, "learning_rate": 1.5959191859430867e-05, "loss": 0.7804, "num_input_tokens_seen": 25400872, "step": 44045 }, { "epoch": 6.560917485850462, "grad_norm": 0.2754207253456116, "learning_rate": 1.5953133158593904e-05, "loss": 0.8144, "num_input_tokens_seen": 25403880, "step": 44050 }, { "epoch": 6.561662198391421, "grad_norm": 0.2114211767911911, "learning_rate": 1.5947075069107402e-05, "loss": 0.7879, "num_input_tokens_seen": 25406856, "step": 44055 }, { "epoch": 6.56240691093238, "grad_norm": 0.18944311141967773, "learning_rate": 1.5941017591380764e-05, "loss": 0.7768, "num_input_tokens_seen": 25409608, "step": 44060 }, { "epoch": 6.563151623473339, "grad_norm": 0.21164928376674652, "learning_rate": 1.5934960725823335e-05, "loss": 0.7853, "num_input_tokens_seen": 25412584, "step": 44065 }, { "epoch": 6.563896336014299, "grad_norm": 0.2545106112957001, "learning_rate": 1.5928904472844393e-05, "loss": 0.799, "num_input_tokens_seen": 25415432, "step": 44070 }, { "epoch": 6.5646410485552575, "grad_norm": 0.22380977869033813, "learning_rate": 1.5922848832853217e-05, "loss": 0.7851, "num_input_tokens_seen": 25418216, "step": 44075 }, { "epoch": 6.565385761096217, "grad_norm": 0.28414487838745117, "learning_rate": 1.5916793806259e-05, "loss": 0.8062, "num_input_tokens_seen": 25421064, "step": 44080 }, { "epoch": 6.566130473637176, "grad_norm": 0.23373901844024658, "learning_rate": 1.5910739393470934e-05, "loss": 0.7986, "num_input_tokens_seen": 25423784, "step": 44085 }, { "epoch": 6.566875186178136, "grad_norm": 0.25875797867774963, "learning_rate": 1.5904685594898154e-05, "loss": 0.8282, "num_input_tokens_seen": 25426792, "step": 44090 }, { "epoch": 6.567619898719094, "grad_norm": 0.23722030222415924, "learning_rate": 1.589863241094974e-05, "loss": 0.7729, "num_input_tokens_seen": 25429864, "step": 44095 }, { "epoch": 6.568364611260054, "grad_norm": 0.2020695060491562, "learning_rate": 1.589257984203473e-05, "loss": 0.8018, "num_input_tokens_seen": 25432552, "step": 44100 }, { "epoch": 6.569109323801013, "grad_norm": 0.18751923739910126, "learning_rate": 1.588652788856215e-05, "loss": 0.8165, "num_input_tokens_seen": 25435272, "step": 44105 }, { "epoch": 6.569854036341972, "grad_norm": 0.20230083167552948, "learning_rate": 1.5880476550940975e-05, "loss": 0.8217, "num_input_tokens_seen": 25438056, "step": 44110 }, { "epoch": 6.570598748882931, "grad_norm": 0.2637406289577484, "learning_rate": 1.5874425829580108e-05, "loss": 0.8149, "num_input_tokens_seen": 25441064, "step": 44115 }, { "epoch": 6.57134346142389, "grad_norm": 0.20889435708522797, "learning_rate": 1.586837572488844e-05, "loss": 0.8035, "num_input_tokens_seen": 25443912, "step": 44120 }, { "epoch": 6.57208817396485, "grad_norm": 0.17382612824440002, "learning_rate": 1.586232623727482e-05, "loss": 0.8061, "num_input_tokens_seen": 25446824, "step": 44125 }, { "epoch": 6.572832886505809, "grad_norm": 0.26390540599823, "learning_rate": 1.5856277367148047e-05, "loss": 0.7777, "num_input_tokens_seen": 25450056, "step": 44130 }, { "epoch": 6.573577599046768, "grad_norm": 0.24005037546157837, "learning_rate": 1.5850229114916864e-05, "loss": 0.8157, "num_input_tokens_seen": 25452808, "step": 44135 }, { "epoch": 6.574322311587727, "grad_norm": 0.25196272134780884, "learning_rate": 1.5844181480989995e-05, "loss": 0.7894, "num_input_tokens_seen": 25455560, "step": 44140 }, { "epoch": 6.575067024128686, "grad_norm": 0.2343834489583969, "learning_rate": 1.5838134465776126e-05, "loss": 0.7854, "num_input_tokens_seen": 25458376, "step": 44145 }, { "epoch": 6.575811736669645, "grad_norm": 0.2501795291900635, "learning_rate": 1.583208806968387e-05, "loss": 0.79, "num_input_tokens_seen": 25461224, "step": 44150 }, { "epoch": 6.576556449210605, "grad_norm": 0.202714204788208, "learning_rate": 1.5826042293121835e-05, "loss": 0.8154, "num_input_tokens_seen": 25464456, "step": 44155 }, { "epoch": 6.5773011617515635, "grad_norm": 0.17114634811878204, "learning_rate": 1.581999713649856e-05, "loss": 0.7817, "num_input_tokens_seen": 25467400, "step": 44160 }, { "epoch": 6.578045874292523, "grad_norm": 0.22110415995121002, "learning_rate": 1.5813952600222556e-05, "loss": 0.8536, "num_input_tokens_seen": 25470152, "step": 44165 }, { "epoch": 6.578790586833482, "grad_norm": 0.24830304086208344, "learning_rate": 1.58079086847023e-05, "loss": 0.7937, "num_input_tokens_seen": 25473384, "step": 44170 }, { "epoch": 6.579535299374442, "grad_norm": 0.20802965760231018, "learning_rate": 1.580186539034619e-05, "loss": 0.7831, "num_input_tokens_seen": 25476648, "step": 44175 }, { "epoch": 6.5802800119154, "grad_norm": 0.1752317100763321, "learning_rate": 1.579582271756262e-05, "loss": 0.7919, "num_input_tokens_seen": 25479624, "step": 44180 }, { "epoch": 6.58102472445636, "grad_norm": 0.18983015418052673, "learning_rate": 1.578978066675993e-05, "loss": 0.795, "num_input_tokens_seen": 25482440, "step": 44185 }, { "epoch": 6.581769436997319, "grad_norm": 0.23976647853851318, "learning_rate": 1.578373923834641e-05, "loss": 0.7964, "num_input_tokens_seen": 25485064, "step": 44190 }, { "epoch": 6.582514149538278, "grad_norm": 0.16176000237464905, "learning_rate": 1.5777698432730333e-05, "loss": 0.7758, "num_input_tokens_seen": 25487912, "step": 44195 }, { "epoch": 6.583258862079237, "grad_norm": 0.24008065462112427, "learning_rate": 1.5771658250319895e-05, "loss": 0.7882, "num_input_tokens_seen": 25491048, "step": 44200 }, { "epoch": 6.584003574620197, "grad_norm": 0.2722919285297394, "learning_rate": 1.576561869152327e-05, "loss": 0.8027, "num_input_tokens_seen": 25494056, "step": 44205 }, { "epoch": 6.584748287161156, "grad_norm": 0.16560906171798706, "learning_rate": 1.5759579756748603e-05, "loss": 0.7856, "num_input_tokens_seen": 25497128, "step": 44210 }, { "epoch": 6.585492999702115, "grad_norm": 0.2561321556568146, "learning_rate": 1.5753541446403964e-05, "loss": 0.7853, "num_input_tokens_seen": 25500360, "step": 44215 }, { "epoch": 6.586237712243074, "grad_norm": 0.23338666558265686, "learning_rate": 1.574750376089739e-05, "loss": 0.8006, "num_input_tokens_seen": 25503528, "step": 44220 }, { "epoch": 6.586982424784034, "grad_norm": 0.2636708617210388, "learning_rate": 1.5741466700636898e-05, "loss": 0.8348, "num_input_tokens_seen": 25506440, "step": 44225 }, { "epoch": 6.587727137324992, "grad_norm": 0.24719154834747314, "learning_rate": 1.5735430266030447e-05, "loss": 0.8246, "num_input_tokens_seen": 25509160, "step": 44230 }, { "epoch": 6.588471849865952, "grad_norm": 0.18866361677646637, "learning_rate": 1.5729394457485946e-05, "loss": 0.8006, "num_input_tokens_seen": 25512040, "step": 44235 }, { "epoch": 6.589216562406911, "grad_norm": 0.20285147428512573, "learning_rate": 1.5723359275411283e-05, "loss": 0.7949, "num_input_tokens_seen": 25514664, "step": 44240 }, { "epoch": 6.58996127494787, "grad_norm": 0.2632600963115692, "learning_rate": 1.571732472021428e-05, "loss": 0.8297, "num_input_tokens_seen": 25517448, "step": 44245 }, { "epoch": 6.590705987488829, "grad_norm": 0.283873587846756, "learning_rate": 1.571129079230274e-05, "loss": 0.7697, "num_input_tokens_seen": 25520168, "step": 44250 }, { "epoch": 6.591450700029789, "grad_norm": 0.20890560746192932, "learning_rate": 1.570525749208439e-05, "loss": 0.8542, "num_input_tokens_seen": 25522792, "step": 44255 }, { "epoch": 6.592195412570748, "grad_norm": 0.19314466416835785, "learning_rate": 1.5699224819966957e-05, "loss": 0.7944, "num_input_tokens_seen": 25525640, "step": 44260 }, { "epoch": 6.592940125111707, "grad_norm": 0.21636179089546204, "learning_rate": 1.5693192776358092e-05, "loss": 0.7842, "num_input_tokens_seen": 25528648, "step": 44265 }, { "epoch": 6.593684837652666, "grad_norm": 0.21356917917728424, "learning_rate": 1.568716136166542e-05, "loss": 0.7974, "num_input_tokens_seen": 25531624, "step": 44270 }, { "epoch": 6.594429550193626, "grad_norm": 0.16226792335510254, "learning_rate": 1.5681130576296528e-05, "loss": 0.8039, "num_input_tokens_seen": 25534184, "step": 44275 }, { "epoch": 6.595174262734584, "grad_norm": 0.1812392622232437, "learning_rate": 1.5675100420658935e-05, "loss": 0.8041, "num_input_tokens_seen": 25537096, "step": 44280 }, { "epoch": 6.595918975275543, "grad_norm": 0.17995770275592804, "learning_rate": 1.5669070895160143e-05, "loss": 0.8037, "num_input_tokens_seen": 25539912, "step": 44285 }, { "epoch": 6.596663687816503, "grad_norm": 0.230668306350708, "learning_rate": 1.566304200020761e-05, "loss": 0.7819, "num_input_tokens_seen": 25542952, "step": 44290 }, { "epoch": 6.5974084003574625, "grad_norm": 0.2320222109556198, "learning_rate": 1.565701373620874e-05, "loss": 0.8027, "num_input_tokens_seen": 25545864, "step": 44295 }, { "epoch": 6.598153112898421, "grad_norm": 0.22008390724658966, "learning_rate": 1.5650986103570887e-05, "loss": 0.8092, "num_input_tokens_seen": 25548744, "step": 44300 }, { "epoch": 6.59889782543938, "grad_norm": 0.2360825389623642, "learning_rate": 1.5644959102701384e-05, "loss": 0.7902, "num_input_tokens_seen": 25552296, "step": 44305 }, { "epoch": 6.59964253798034, "grad_norm": 0.2263856679201126, "learning_rate": 1.5638932734007515e-05, "loss": 0.81, "num_input_tokens_seen": 25554952, "step": 44310 }, { "epoch": 6.600387250521299, "grad_norm": 0.21194352209568024, "learning_rate": 1.563290699789651e-05, "loss": 0.8275, "num_input_tokens_seen": 25557704, "step": 44315 }, { "epoch": 6.601131963062258, "grad_norm": 0.16743159294128418, "learning_rate": 1.562688189477556e-05, "loss": 0.7664, "num_input_tokens_seen": 25560648, "step": 44320 }, { "epoch": 6.601876675603217, "grad_norm": 0.17468275129795074, "learning_rate": 1.562085742505183e-05, "loss": 0.8127, "num_input_tokens_seen": 25563848, "step": 44325 }, { "epoch": 6.602621388144176, "grad_norm": 0.293453186750412, "learning_rate": 1.5614833589132427e-05, "loss": 0.8031, "num_input_tokens_seen": 25566920, "step": 44330 }, { "epoch": 6.603366100685135, "grad_norm": 0.17983491718769073, "learning_rate": 1.5608810387424406e-05, "loss": 0.8337, "num_input_tokens_seen": 25569576, "step": 44335 }, { "epoch": 6.604110813226095, "grad_norm": 0.2913016080856323, "learning_rate": 1.5602787820334798e-05, "loss": 0.8396, "num_input_tokens_seen": 25572584, "step": 44340 }, { "epoch": 6.604855525767054, "grad_norm": 0.3302944600582123, "learning_rate": 1.559676588827058e-05, "loss": 0.7998, "num_input_tokens_seen": 25575464, "step": 44345 }, { "epoch": 6.605600238308013, "grad_norm": 0.2731451988220215, "learning_rate": 1.5590744591638693e-05, "loss": 0.8118, "num_input_tokens_seen": 25578248, "step": 44350 }, { "epoch": 6.606344950848972, "grad_norm": 0.18741898238658905, "learning_rate": 1.5584723930846034e-05, "loss": 0.7965, "num_input_tokens_seen": 25580904, "step": 44355 }, { "epoch": 6.607089663389932, "grad_norm": 0.21039578318595886, "learning_rate": 1.557870390629945e-05, "loss": 0.8143, "num_input_tokens_seen": 25584040, "step": 44360 }, { "epoch": 6.60783437593089, "grad_norm": 0.2089434117078781, "learning_rate": 1.5572684518405757e-05, "loss": 0.7779, "num_input_tokens_seen": 25586856, "step": 44365 }, { "epoch": 6.60857908847185, "grad_norm": 0.3306741714477539, "learning_rate": 1.5566665767571708e-05, "loss": 0.8164, "num_input_tokens_seen": 25590088, "step": 44370 }, { "epoch": 6.609323801012809, "grad_norm": 0.17792178690433502, "learning_rate": 1.5560647654204043e-05, "loss": 0.8326, "num_input_tokens_seen": 25592872, "step": 44375 }, { "epoch": 6.6100685135537685, "grad_norm": 0.1938408762216568, "learning_rate": 1.5554630178709427e-05, "loss": 0.8091, "num_input_tokens_seen": 25595560, "step": 44380 }, { "epoch": 6.610813226094727, "grad_norm": 0.26688843965530396, "learning_rate": 1.55486133414945e-05, "loss": 0.8428, "num_input_tokens_seen": 25598568, "step": 44385 }, { "epoch": 6.611557938635687, "grad_norm": 0.20603054761886597, "learning_rate": 1.5542597142965857e-05, "loss": 0.7905, "num_input_tokens_seen": 25601512, "step": 44390 }, { "epoch": 6.612302651176646, "grad_norm": 0.24113719165325165, "learning_rate": 1.5536581583530048e-05, "loss": 0.7969, "num_input_tokens_seen": 25604520, "step": 44395 }, { "epoch": 6.613047363717605, "grad_norm": 0.16831834614276886, "learning_rate": 1.5530566663593584e-05, "loss": 0.7902, "num_input_tokens_seen": 25607400, "step": 44400 }, { "epoch": 6.613792076258564, "grad_norm": 0.21157793700695038, "learning_rate": 1.552455238356292e-05, "loss": 0.8025, "num_input_tokens_seen": 25610344, "step": 44405 }, { "epoch": 6.614536788799524, "grad_norm": 0.19082224369049072, "learning_rate": 1.551853874384448e-05, "loss": 0.7943, "num_input_tokens_seen": 25613288, "step": 44410 }, { "epoch": 6.615281501340482, "grad_norm": 0.22005431354045868, "learning_rate": 1.5512525744844656e-05, "loss": 0.7876, "num_input_tokens_seen": 25615912, "step": 44415 }, { "epoch": 6.616026213881442, "grad_norm": 0.21945758163928986, "learning_rate": 1.5506513386969757e-05, "loss": 0.797, "num_input_tokens_seen": 25618568, "step": 44420 }, { "epoch": 6.616770926422401, "grad_norm": 0.3333865702152252, "learning_rate": 1.550050167062609e-05, "loss": 0.8368, "num_input_tokens_seen": 25621320, "step": 44425 }, { "epoch": 6.6175156389633605, "grad_norm": 0.21070842444896698, "learning_rate": 1.549449059621989e-05, "loss": 0.8389, "num_input_tokens_seen": 25624232, "step": 44430 }, { "epoch": 6.618260351504319, "grad_norm": 0.2102402150630951, "learning_rate": 1.5488480164157375e-05, "loss": 0.8085, "num_input_tokens_seen": 25626984, "step": 44435 }, { "epoch": 6.619005064045279, "grad_norm": 0.23072777688503265, "learning_rate": 1.5482470374844698e-05, "loss": 0.8192, "num_input_tokens_seen": 25629672, "step": 44440 }, { "epoch": 6.619749776586238, "grad_norm": 0.23415951430797577, "learning_rate": 1.5476461228687976e-05, "loss": 0.7881, "num_input_tokens_seen": 25632456, "step": 44445 }, { "epoch": 6.620494489127196, "grad_norm": 0.21166527271270752, "learning_rate": 1.5470452726093287e-05, "loss": 0.7983, "num_input_tokens_seen": 25635688, "step": 44450 }, { "epoch": 6.621239201668156, "grad_norm": 0.2663823068141937, "learning_rate": 1.5464444867466666e-05, "loss": 0.7975, "num_input_tokens_seen": 25638472, "step": 44455 }, { "epoch": 6.621983914209116, "grad_norm": 0.2871273159980774, "learning_rate": 1.5458437653214088e-05, "loss": 0.8061, "num_input_tokens_seen": 25641288, "step": 44460 }, { "epoch": 6.6227286267500745, "grad_norm": 0.28742527961730957, "learning_rate": 1.545243108374149e-05, "loss": 0.8057, "num_input_tokens_seen": 25644392, "step": 44465 }, { "epoch": 6.623473339291033, "grad_norm": 0.1829289346933365, "learning_rate": 1.544642515945479e-05, "loss": 0.7876, "num_input_tokens_seen": 25647240, "step": 44470 }, { "epoch": 6.624218051831993, "grad_norm": 0.2525111138820648, "learning_rate": 1.5440419880759838e-05, "loss": 0.8087, "num_input_tokens_seen": 25649928, "step": 44475 }, { "epoch": 6.6249627643729525, "grad_norm": 0.24039052426815033, "learning_rate": 1.5434415248062435e-05, "loss": 0.8098, "num_input_tokens_seen": 25652808, "step": 44480 }, { "epoch": 6.625707476913911, "grad_norm": 0.21495909988880157, "learning_rate": 1.542841126176836e-05, "loss": 0.7993, "num_input_tokens_seen": 25655688, "step": 44485 }, { "epoch": 6.62645218945487, "grad_norm": 0.4195135533809662, "learning_rate": 1.5422407922283343e-05, "loss": 0.7645, "num_input_tokens_seen": 25658472, "step": 44490 }, { "epoch": 6.62719690199583, "grad_norm": 0.20277047157287598, "learning_rate": 1.5416405230013065e-05, "loss": 0.8008, "num_input_tokens_seen": 25661224, "step": 44495 }, { "epoch": 6.627941614536788, "grad_norm": 0.24206271767616272, "learning_rate": 1.5410403185363147e-05, "loss": 0.7972, "num_input_tokens_seen": 25664424, "step": 44500 }, { "epoch": 6.628686327077748, "grad_norm": 0.20260220766067505, "learning_rate": 1.540440178873919e-05, "loss": 0.8021, "num_input_tokens_seen": 25667464, "step": 44505 }, { "epoch": 6.629431039618707, "grad_norm": 0.2525064945220947, "learning_rate": 1.539840104054676e-05, "loss": 0.8147, "num_input_tokens_seen": 25670504, "step": 44510 }, { "epoch": 6.6301757521596665, "grad_norm": 0.24175399541854858, "learning_rate": 1.5392400941191337e-05, "loss": 0.7982, "num_input_tokens_seen": 25673480, "step": 44515 }, { "epoch": 6.630920464700625, "grad_norm": 0.3103911280632019, "learning_rate": 1.53864014910784e-05, "loss": 0.8033, "num_input_tokens_seen": 25676296, "step": 44520 }, { "epoch": 6.631665177241585, "grad_norm": 0.26724985241889954, "learning_rate": 1.538040269061337e-05, "loss": 0.7868, "num_input_tokens_seen": 25679176, "step": 44525 }, { "epoch": 6.632409889782544, "grad_norm": 0.246278315782547, "learning_rate": 1.5374404540201612e-05, "loss": 0.8326, "num_input_tokens_seen": 25682024, "step": 44530 }, { "epoch": 6.633154602323503, "grad_norm": 0.23272711038589478, "learning_rate": 1.5368407040248467e-05, "loss": 0.8058, "num_input_tokens_seen": 25684904, "step": 44535 }, { "epoch": 6.633899314864462, "grad_norm": 0.17785175144672394, "learning_rate": 1.536241019115921e-05, "loss": 0.7825, "num_input_tokens_seen": 25687784, "step": 44540 }, { "epoch": 6.634644027405422, "grad_norm": 0.2843380868434906, "learning_rate": 1.5356413993339088e-05, "loss": 0.7809, "num_input_tokens_seen": 25691016, "step": 44545 }, { "epoch": 6.6353887399463805, "grad_norm": 0.168731227517128, "learning_rate": 1.5350418447193298e-05, "loss": 0.8172, "num_input_tokens_seen": 25693896, "step": 44550 }, { "epoch": 6.63613345248734, "grad_norm": 0.2648715376853943, "learning_rate": 1.5344423553126997e-05, "loss": 0.8078, "num_input_tokens_seen": 25696776, "step": 44555 }, { "epoch": 6.636878165028299, "grad_norm": 0.20344536006450653, "learning_rate": 1.53384293115453e-05, "loss": 0.8092, "num_input_tokens_seen": 25699912, "step": 44560 }, { "epoch": 6.6376228775692585, "grad_norm": 0.19920869171619415, "learning_rate": 1.5332435722853263e-05, "loss": 0.8047, "num_input_tokens_seen": 25702632, "step": 44565 }, { "epoch": 6.638367590110217, "grad_norm": 0.24066302180290222, "learning_rate": 1.532644278745592e-05, "loss": 0.807, "num_input_tokens_seen": 25705352, "step": 44570 }, { "epoch": 6.639112302651177, "grad_norm": 0.22180700302124023, "learning_rate": 1.5320450505758247e-05, "loss": 0.7941, "num_input_tokens_seen": 25708264, "step": 44575 }, { "epoch": 6.639857015192136, "grad_norm": 0.24367505311965942, "learning_rate": 1.531445887816517e-05, "loss": 0.7945, "num_input_tokens_seen": 25710984, "step": 44580 }, { "epoch": 6.640601727733095, "grad_norm": 0.26467469334602356, "learning_rate": 1.530846790508158e-05, "loss": 0.7659, "num_input_tokens_seen": 25713864, "step": 44585 }, { "epoch": 6.641346440274054, "grad_norm": 0.2868402600288391, "learning_rate": 1.5302477586912333e-05, "loss": 0.8195, "num_input_tokens_seen": 25716744, "step": 44590 }, { "epoch": 6.642091152815014, "grad_norm": 0.26834896206855774, "learning_rate": 1.5296487924062218e-05, "loss": 0.7794, "num_input_tokens_seen": 25719976, "step": 44595 }, { "epoch": 6.6428358653559725, "grad_norm": 0.25730404257774353, "learning_rate": 1.5290498916935995e-05, "loss": 0.7912, "num_input_tokens_seen": 25723016, "step": 44600 }, { "epoch": 6.643580577896932, "grad_norm": 0.2574942708015442, "learning_rate": 1.5284510565938385e-05, "loss": 0.8029, "num_input_tokens_seen": 25725672, "step": 44605 }, { "epoch": 6.644325290437891, "grad_norm": 0.20610639452934265, "learning_rate": 1.5278522871474045e-05, "loss": 0.7956, "num_input_tokens_seen": 25728584, "step": 44610 }, { "epoch": 6.6450700029788505, "grad_norm": 0.34616339206695557, "learning_rate": 1.527253583394762e-05, "loss": 0.8188, "num_input_tokens_seen": 25731688, "step": 44615 }, { "epoch": 6.645814715519809, "grad_norm": 0.19194166362285614, "learning_rate": 1.5266549453763655e-05, "loss": 0.8028, "num_input_tokens_seen": 25734504, "step": 44620 }, { "epoch": 6.646559428060769, "grad_norm": 0.1958639919757843, "learning_rate": 1.5260563731326715e-05, "loss": 0.8186, "num_input_tokens_seen": 25737288, "step": 44625 }, { "epoch": 6.647304140601728, "grad_norm": 0.2056252807378769, "learning_rate": 1.5254578667041278e-05, "loss": 0.8339, "num_input_tokens_seen": 25740008, "step": 44630 }, { "epoch": 6.6480488531426865, "grad_norm": 0.23036333918571472, "learning_rate": 1.5248594261311789e-05, "loss": 0.8347, "num_input_tokens_seen": 25742792, "step": 44635 }, { "epoch": 6.648793565683646, "grad_norm": 0.27479425072669983, "learning_rate": 1.524261051454266e-05, "loss": 0.8208, "num_input_tokens_seen": 25745416, "step": 44640 }, { "epoch": 6.649538278224606, "grad_norm": 0.25671812891960144, "learning_rate": 1.5236627427138237e-05, "loss": 0.7969, "num_input_tokens_seen": 25748552, "step": 44645 }, { "epoch": 6.6502829907655645, "grad_norm": 0.3039012551307678, "learning_rate": 1.5230644999502835e-05, "loss": 0.7963, "num_input_tokens_seen": 25751400, "step": 44650 }, { "epoch": 6.651027703306523, "grad_norm": 0.2115037590265274, "learning_rate": 1.5224663232040736e-05, "loss": 0.7834, "num_input_tokens_seen": 25754568, "step": 44655 }, { "epoch": 6.651772415847483, "grad_norm": 0.22584553062915802, "learning_rate": 1.5218682125156148e-05, "loss": 0.7916, "num_input_tokens_seen": 25757448, "step": 44660 }, { "epoch": 6.652517128388443, "grad_norm": 0.21429447829723358, "learning_rate": 1.521270167925325e-05, "loss": 0.8019, "num_input_tokens_seen": 25760296, "step": 44665 }, { "epoch": 6.653261840929401, "grad_norm": 0.26458197832107544, "learning_rate": 1.5206721894736178e-05, "loss": 0.7939, "num_input_tokens_seen": 25763176, "step": 44670 }, { "epoch": 6.65400655347036, "grad_norm": 0.24071189761161804, "learning_rate": 1.520074277200903e-05, "loss": 0.8081, "num_input_tokens_seen": 25766216, "step": 44675 }, { "epoch": 6.65475126601132, "grad_norm": 0.2896532416343689, "learning_rate": 1.519476431147584e-05, "loss": 0.798, "num_input_tokens_seen": 25769192, "step": 44680 }, { "epoch": 6.6554959785522785, "grad_norm": 0.19094328582286835, "learning_rate": 1.518878651354061e-05, "loss": 0.7915, "num_input_tokens_seen": 25772200, "step": 44685 }, { "epoch": 6.656240691093238, "grad_norm": 0.18877294659614563, "learning_rate": 1.5182809378607304e-05, "loss": 0.8001, "num_input_tokens_seen": 25774888, "step": 44690 }, { "epoch": 6.656985403634197, "grad_norm": 0.270903080701828, "learning_rate": 1.5176832907079836e-05, "loss": 0.8067, "num_input_tokens_seen": 25777864, "step": 44695 }, { "epoch": 6.6577301161751565, "grad_norm": 0.25661617517471313, "learning_rate": 1.5170857099362045e-05, "loss": 0.8232, "num_input_tokens_seen": 25780520, "step": 44700 }, { "epoch": 6.658474828716115, "grad_norm": 0.1232149600982666, "learning_rate": 1.5164881955857774e-05, "loss": 0.7703, "num_input_tokens_seen": 25783368, "step": 44705 }, { "epoch": 6.659219541257075, "grad_norm": 0.24211788177490234, "learning_rate": 1.5158907476970796e-05, "loss": 0.7842, "num_input_tokens_seen": 25786664, "step": 44710 }, { "epoch": 6.659964253798034, "grad_norm": 0.2500284016132355, "learning_rate": 1.5152933663104834e-05, "loss": 0.8216, "num_input_tokens_seen": 25789672, "step": 44715 }, { "epoch": 6.660708966338993, "grad_norm": 0.21196995675563812, "learning_rate": 1.5146960514663583e-05, "loss": 0.8153, "num_input_tokens_seen": 25792456, "step": 44720 }, { "epoch": 6.661453678879952, "grad_norm": 0.30604270100593567, "learning_rate": 1.5140988032050685e-05, "loss": 0.7913, "num_input_tokens_seen": 25795176, "step": 44725 }, { "epoch": 6.662198391420912, "grad_norm": 0.23272839188575745, "learning_rate": 1.5135016215669724e-05, "loss": 0.8069, "num_input_tokens_seen": 25798152, "step": 44730 }, { "epoch": 6.6629431039618705, "grad_norm": 0.15636534988880157, "learning_rate": 1.5129045065924271e-05, "loss": 0.8241, "num_input_tokens_seen": 25801064, "step": 44735 }, { "epoch": 6.66368781650283, "grad_norm": 0.15481428802013397, "learning_rate": 1.5123074583217812e-05, "loss": 0.7791, "num_input_tokens_seen": 25803752, "step": 44740 }, { "epoch": 6.664432529043789, "grad_norm": 0.187606081366539, "learning_rate": 1.5117104767953818e-05, "loss": 0.7882, "num_input_tokens_seen": 25806984, "step": 44745 }, { "epoch": 6.665177241584749, "grad_norm": 0.2585318386554718, "learning_rate": 1.51111356205357e-05, "loss": 0.8127, "num_input_tokens_seen": 25809832, "step": 44750 }, { "epoch": 6.665921954125707, "grad_norm": 0.26145997643470764, "learning_rate": 1.5105167141366836e-05, "loss": 0.7922, "num_input_tokens_seen": 25812840, "step": 44755 }, { "epoch": 6.666666666666667, "grad_norm": 0.2962815761566162, "learning_rate": 1.509919933085054e-05, "loss": 0.8149, "num_input_tokens_seen": 25815880, "step": 44760 }, { "epoch": 6.667411379207626, "grad_norm": 0.2621111571788788, "learning_rate": 1.5093232189390103e-05, "loss": 0.7976, "num_input_tokens_seen": 25818696, "step": 44765 }, { "epoch": 6.668156091748585, "grad_norm": 0.3044244945049286, "learning_rate": 1.508726571738876e-05, "loss": 0.7815, "num_input_tokens_seen": 25822312, "step": 44770 }, { "epoch": 6.668900804289544, "grad_norm": 0.34553200006484985, "learning_rate": 1.5081299915249702e-05, "loss": 0.7882, "num_input_tokens_seen": 25825832, "step": 44775 }, { "epoch": 6.669645516830504, "grad_norm": 0.3194652199745178, "learning_rate": 1.507533478337606e-05, "loss": 0.7961, "num_input_tokens_seen": 25828744, "step": 44780 }, { "epoch": 6.6703902293714625, "grad_norm": 0.1656333953142166, "learning_rate": 1.5069370322170941e-05, "loss": 0.7994, "num_input_tokens_seen": 25831752, "step": 44785 }, { "epoch": 6.671134941912422, "grad_norm": 0.18527427315711975, "learning_rate": 1.5063406532037408e-05, "loss": 0.8128, "num_input_tokens_seen": 25834664, "step": 44790 }, { "epoch": 6.671879654453381, "grad_norm": 0.19159236550331116, "learning_rate": 1.5057443413378458e-05, "loss": 0.7873, "num_input_tokens_seen": 25837384, "step": 44795 }, { "epoch": 6.67262436699434, "grad_norm": 0.19226545095443726, "learning_rate": 1.5051480966597054e-05, "loss": 0.8092, "num_input_tokens_seen": 25840328, "step": 44800 }, { "epoch": 6.673369079535299, "grad_norm": 0.1871183216571808, "learning_rate": 1.5045519192096128e-05, "loss": 0.795, "num_input_tokens_seen": 25843336, "step": 44805 }, { "epoch": 6.674113792076259, "grad_norm": 0.13569021224975586, "learning_rate": 1.5039558090278538e-05, "loss": 0.7939, "num_input_tokens_seen": 25846056, "step": 44810 }, { "epoch": 6.674858504617218, "grad_norm": 0.21338163316249847, "learning_rate": 1.5033597661547123e-05, "loss": 0.8156, "num_input_tokens_seen": 25848904, "step": 44815 }, { "epoch": 6.6756032171581765, "grad_norm": 0.14679963886737823, "learning_rate": 1.5027637906304648e-05, "loss": 0.7917, "num_input_tokens_seen": 25851624, "step": 44820 }, { "epoch": 6.676347929699136, "grad_norm": 0.17491431534290314, "learning_rate": 1.5021678824953867e-05, "loss": 0.81, "num_input_tokens_seen": 25854568, "step": 44825 }, { "epoch": 6.677092642240096, "grad_norm": 0.18497948348522186, "learning_rate": 1.5015720417897456e-05, "loss": 0.8335, "num_input_tokens_seen": 25857512, "step": 44830 }, { "epoch": 6.677837354781055, "grad_norm": 0.20333810150623322, "learning_rate": 1.5009762685538065e-05, "loss": 0.8038, "num_input_tokens_seen": 25860360, "step": 44835 }, { "epoch": 6.678582067322013, "grad_norm": 0.15533429384231567, "learning_rate": 1.5003805628278297e-05, "loss": 0.7923, "num_input_tokens_seen": 25863304, "step": 44840 }, { "epoch": 6.679326779862973, "grad_norm": 0.2036246657371521, "learning_rate": 1.49978492465207e-05, "loss": 0.8048, "num_input_tokens_seen": 25865928, "step": 44845 }, { "epoch": 6.680071492403932, "grad_norm": 0.2144298255443573, "learning_rate": 1.4991893540667783e-05, "loss": 0.7867, "num_input_tokens_seen": 25869096, "step": 44850 }, { "epoch": 6.680816204944891, "grad_norm": 0.2158402055501938, "learning_rate": 1.4985938511122027e-05, "loss": 0.8034, "num_input_tokens_seen": 25872008, "step": 44855 }, { "epoch": 6.68156091748585, "grad_norm": 0.2658008337020874, "learning_rate": 1.497998415828582e-05, "loss": 0.7981, "num_input_tokens_seen": 25875240, "step": 44860 }, { "epoch": 6.68230563002681, "grad_norm": 0.21337063610553741, "learning_rate": 1.4974030482561546e-05, "loss": 0.7607, "num_input_tokens_seen": 25878472, "step": 44865 }, { "epoch": 6.6830503425677685, "grad_norm": 0.2189393937587738, "learning_rate": 1.4968077484351529e-05, "loss": 0.8059, "num_input_tokens_seen": 25881288, "step": 44870 }, { "epoch": 6.683795055108728, "grad_norm": 0.21784910559654236, "learning_rate": 1.496212516405805e-05, "loss": 0.7985, "num_input_tokens_seen": 25884104, "step": 44875 }, { "epoch": 6.684539767649687, "grad_norm": 0.2782020568847656, "learning_rate": 1.4956173522083338e-05, "loss": 0.7996, "num_input_tokens_seen": 25886856, "step": 44880 }, { "epoch": 6.685284480190647, "grad_norm": 0.21392560005187988, "learning_rate": 1.4950222558829582e-05, "loss": 0.7969, "num_input_tokens_seen": 25889896, "step": 44885 }, { "epoch": 6.686029192731605, "grad_norm": 0.2830999791622162, "learning_rate": 1.4944272274698935e-05, "loss": 0.7955, "num_input_tokens_seen": 25892904, "step": 44890 }, { "epoch": 6.686773905272565, "grad_norm": 0.18796081840991974, "learning_rate": 1.4938322670093485e-05, "loss": 0.7978, "num_input_tokens_seen": 25895624, "step": 44895 }, { "epoch": 6.687518617813524, "grad_norm": 0.25052517652511597, "learning_rate": 1.4932373745415273e-05, "loss": 0.7965, "num_input_tokens_seen": 25898632, "step": 44900 }, { "epoch": 6.688263330354483, "grad_norm": 0.264343798160553, "learning_rate": 1.4926425501066313e-05, "loss": 0.8269, "num_input_tokens_seen": 25901256, "step": 44905 }, { "epoch": 6.689008042895442, "grad_norm": 0.26470762491226196, "learning_rate": 1.4920477937448565e-05, "loss": 0.8166, "num_input_tokens_seen": 25903848, "step": 44910 }, { "epoch": 6.689752755436402, "grad_norm": 0.2740026116371155, "learning_rate": 1.4914531054963931e-05, "loss": 0.8067, "num_input_tokens_seen": 25906696, "step": 44915 }, { "epoch": 6.690497467977361, "grad_norm": 0.2104543149471283, "learning_rate": 1.4908584854014294e-05, "loss": 0.8034, "num_input_tokens_seen": 25909992, "step": 44920 }, { "epoch": 6.69124218051832, "grad_norm": 0.20594029128551483, "learning_rate": 1.4902639335001456e-05, "loss": 0.7908, "num_input_tokens_seen": 25913096, "step": 44925 }, { "epoch": 6.691986893059279, "grad_norm": 0.26437628269195557, "learning_rate": 1.4896694498327195e-05, "loss": 0.8082, "num_input_tokens_seen": 25916168, "step": 44930 }, { "epoch": 6.692731605600239, "grad_norm": 0.18708069622516632, "learning_rate": 1.4890750344393254e-05, "loss": 0.7884, "num_input_tokens_seen": 25919144, "step": 44935 }, { "epoch": 6.693476318141197, "grad_norm": 0.19613786041736603, "learning_rate": 1.4884806873601303e-05, "loss": 0.7785, "num_input_tokens_seen": 25922184, "step": 44940 }, { "epoch": 6.694221030682157, "grad_norm": 0.23065289855003357, "learning_rate": 1.4878864086352973e-05, "loss": 0.7969, "num_input_tokens_seen": 25925064, "step": 44945 }, { "epoch": 6.694965743223116, "grad_norm": 0.21612733602523804, "learning_rate": 1.4872921983049854e-05, "loss": 0.8081, "num_input_tokens_seen": 25927688, "step": 44950 }, { "epoch": 6.695710455764075, "grad_norm": 0.30288195610046387, "learning_rate": 1.4866980564093503e-05, "loss": 0.7865, "num_input_tokens_seen": 25930696, "step": 44955 }, { "epoch": 6.696455168305034, "grad_norm": 0.17858490347862244, "learning_rate": 1.4861039829885398e-05, "loss": 0.8104, "num_input_tokens_seen": 25933704, "step": 44960 }, { "epoch": 6.697199880845994, "grad_norm": 0.2641066908836365, "learning_rate": 1.4855099780827004e-05, "loss": 0.8065, "num_input_tokens_seen": 25936584, "step": 44965 }, { "epoch": 6.697944593386953, "grad_norm": 0.2413473129272461, "learning_rate": 1.4849160417319724e-05, "loss": 0.819, "num_input_tokens_seen": 25939464, "step": 44970 }, { "epoch": 6.698689305927912, "grad_norm": 0.3044629395008087, "learning_rate": 1.4843221739764906e-05, "loss": 0.7968, "num_input_tokens_seen": 25942472, "step": 44975 }, { "epoch": 6.699434018468871, "grad_norm": 0.23004662990570068, "learning_rate": 1.483728374856388e-05, "loss": 0.7899, "num_input_tokens_seen": 25945192, "step": 44980 }, { "epoch": 6.70017873100983, "grad_norm": 0.2598574161529541, "learning_rate": 1.4831346444117888e-05, "loss": 0.8434, "num_input_tokens_seen": 25948232, "step": 44985 }, { "epoch": 6.700923443550789, "grad_norm": 0.19517649710178375, "learning_rate": 1.4825409826828169e-05, "loss": 0.7793, "num_input_tokens_seen": 25951112, "step": 44990 }, { "epoch": 6.701668156091749, "grad_norm": 0.19092880189418793, "learning_rate": 1.4819473897095876e-05, "loss": 0.797, "num_input_tokens_seen": 25954056, "step": 44995 }, { "epoch": 6.702412868632708, "grad_norm": 0.34865090250968933, "learning_rate": 1.4813538655322151e-05, "loss": 0.795, "num_input_tokens_seen": 25957096, "step": 45000 }, { "epoch": 6.703157581173667, "grad_norm": 0.16062788665294647, "learning_rate": 1.4807604101908073e-05, "loss": 0.7841, "num_input_tokens_seen": 25959688, "step": 45005 }, { "epoch": 6.703902293714626, "grad_norm": 0.19863887131214142, "learning_rate": 1.4801670237254664e-05, "loss": 0.8155, "num_input_tokens_seen": 25962728, "step": 45010 }, { "epoch": 6.704647006255585, "grad_norm": 0.2912902534008026, "learning_rate": 1.4795737061762918e-05, "loss": 0.8432, "num_input_tokens_seen": 25965480, "step": 45015 }, { "epoch": 6.705391718796545, "grad_norm": 0.2147141993045807, "learning_rate": 1.4789804575833782e-05, "loss": 0.795, "num_input_tokens_seen": 25968488, "step": 45020 }, { "epoch": 6.706136431337503, "grad_norm": 0.2901335060596466, "learning_rate": 1.4783872779868141e-05, "loss": 0.78, "num_input_tokens_seen": 25971496, "step": 45025 }, { "epoch": 6.706881143878463, "grad_norm": 0.20240585505962372, "learning_rate": 1.4777941674266832e-05, "loss": 0.7946, "num_input_tokens_seen": 25974120, "step": 45030 }, { "epoch": 6.707625856419422, "grad_norm": 0.17534692585468292, "learning_rate": 1.4772011259430668e-05, "loss": 0.8105, "num_input_tokens_seen": 25976840, "step": 45035 }, { "epoch": 6.708370568960381, "grad_norm": 0.2982579171657562, "learning_rate": 1.4766081535760401e-05, "loss": 0.7802, "num_input_tokens_seen": 25980104, "step": 45040 }, { "epoch": 6.70911528150134, "grad_norm": 0.2235504388809204, "learning_rate": 1.4760152503656733e-05, "loss": 0.7761, "num_input_tokens_seen": 25982664, "step": 45045 }, { "epoch": 6.7098599940423, "grad_norm": 0.2903013825416565, "learning_rate": 1.4754224163520325e-05, "loss": 0.8266, "num_input_tokens_seen": 25986024, "step": 45050 }, { "epoch": 6.710604706583259, "grad_norm": 0.23397456109523773, "learning_rate": 1.4748296515751797e-05, "loss": 0.8246, "num_input_tokens_seen": 25988808, "step": 45055 }, { "epoch": 6.711349419124218, "grad_norm": 0.2211606204509735, "learning_rate": 1.4742369560751718e-05, "loss": 0.8043, "num_input_tokens_seen": 25991720, "step": 45060 }, { "epoch": 6.712094131665177, "grad_norm": 0.21162331104278564, "learning_rate": 1.4736443298920588e-05, "loss": 0.8166, "num_input_tokens_seen": 25994408, "step": 45065 }, { "epoch": 6.712838844206137, "grad_norm": 0.16807520389556885, "learning_rate": 1.4730517730658888e-05, "loss": 0.8004, "num_input_tokens_seen": 25997128, "step": 45070 }, { "epoch": 6.713583556747095, "grad_norm": 0.3216419219970703, "learning_rate": 1.4724592856367057e-05, "loss": 0.808, "num_input_tokens_seen": 25999880, "step": 45075 }, { "epoch": 6.714328269288055, "grad_norm": 0.2953130900859833, "learning_rate": 1.4718668676445454e-05, "loss": 0.8279, "num_input_tokens_seen": 26002696, "step": 45080 }, { "epoch": 6.715072981829014, "grad_norm": 0.22848300635814667, "learning_rate": 1.4712745191294431e-05, "loss": 0.8237, "num_input_tokens_seen": 26005768, "step": 45085 }, { "epoch": 6.7158176943699734, "grad_norm": 0.256136029958725, "learning_rate": 1.4706822401314254e-05, "loss": 0.8062, "num_input_tokens_seen": 26008872, "step": 45090 }, { "epoch": 6.716562406910932, "grad_norm": 0.26559656858444214, "learning_rate": 1.4700900306905172e-05, "loss": 0.7791, "num_input_tokens_seen": 26011464, "step": 45095 }, { "epoch": 6.717307119451892, "grad_norm": 0.20258010923862457, "learning_rate": 1.4694978908467384e-05, "loss": 0.7942, "num_input_tokens_seen": 26014408, "step": 45100 }, { "epoch": 6.718051831992851, "grad_norm": 0.2792817950248718, "learning_rate": 1.468905820640102e-05, "loss": 0.8213, "num_input_tokens_seen": 26017672, "step": 45105 }, { "epoch": 6.71879654453381, "grad_norm": 0.23871712386608124, "learning_rate": 1.4683138201106175e-05, "loss": 0.8027, "num_input_tokens_seen": 26020744, "step": 45110 }, { "epoch": 6.719541257074769, "grad_norm": 0.2860863506793976, "learning_rate": 1.4677218892982906e-05, "loss": 0.8013, "num_input_tokens_seen": 26024040, "step": 45115 }, { "epoch": 6.720285969615729, "grad_norm": 0.2555076479911804, "learning_rate": 1.467130028243122e-05, "loss": 0.7909, "num_input_tokens_seen": 26026856, "step": 45120 }, { "epoch": 6.721030682156687, "grad_norm": 0.23270997405052185, "learning_rate": 1.466538236985106e-05, "loss": 0.7843, "num_input_tokens_seen": 26029704, "step": 45125 }, { "epoch": 6.721775394697647, "grad_norm": 0.19303728640079498, "learning_rate": 1.4659465155642343e-05, "loss": 0.7925, "num_input_tokens_seen": 26032360, "step": 45130 }, { "epoch": 6.722520107238606, "grad_norm": 0.19042769074440002, "learning_rate": 1.4653548640204934e-05, "loss": 0.8036, "num_input_tokens_seen": 26035336, "step": 45135 }, { "epoch": 6.7232648197795655, "grad_norm": 0.19112861156463623, "learning_rate": 1.4647632823938651e-05, "loss": 0.7943, "num_input_tokens_seen": 26038344, "step": 45140 }, { "epoch": 6.724009532320524, "grad_norm": 0.30258727073669434, "learning_rate": 1.4641717707243238e-05, "loss": 0.8085, "num_input_tokens_seen": 26041096, "step": 45145 }, { "epoch": 6.724754244861483, "grad_norm": 0.15911337733268738, "learning_rate": 1.463580329051843e-05, "loss": 0.7999, "num_input_tokens_seen": 26044008, "step": 45150 }, { "epoch": 6.725498957402443, "grad_norm": 0.1677679419517517, "learning_rate": 1.4629889574163903e-05, "loss": 0.7823, "num_input_tokens_seen": 26046760, "step": 45155 }, { "epoch": 6.726243669943402, "grad_norm": 0.1801857054233551, "learning_rate": 1.4623976558579272e-05, "loss": 0.7985, "num_input_tokens_seen": 26049800, "step": 45160 }, { "epoch": 6.726988382484361, "grad_norm": 0.18351301550865173, "learning_rate": 1.461806424416412e-05, "loss": 0.7996, "num_input_tokens_seen": 26052584, "step": 45165 }, { "epoch": 6.72773309502532, "grad_norm": 0.16836483776569366, "learning_rate": 1.4612152631317976e-05, "loss": 0.7658, "num_input_tokens_seen": 26055720, "step": 45170 }, { "epoch": 6.7284778075662794, "grad_norm": 0.16430531442165375, "learning_rate": 1.4606241720440326e-05, "loss": 0.7923, "num_input_tokens_seen": 26058728, "step": 45175 }, { "epoch": 6.729222520107239, "grad_norm": 0.20852912962436676, "learning_rate": 1.4600331511930609e-05, "loss": 0.7858, "num_input_tokens_seen": 26061544, "step": 45180 }, { "epoch": 6.729967232648198, "grad_norm": 0.1689368188381195, "learning_rate": 1.4594422006188196e-05, "loss": 0.7963, "num_input_tokens_seen": 26064488, "step": 45185 }, { "epoch": 6.730711945189157, "grad_norm": 0.18013940751552582, "learning_rate": 1.458851320361244e-05, "loss": 0.7836, "num_input_tokens_seen": 26067304, "step": 45190 }, { "epoch": 6.731456657730116, "grad_norm": 0.32108408212661743, "learning_rate": 1.458260510460264e-05, "loss": 0.829, "num_input_tokens_seen": 26069896, "step": 45195 }, { "epoch": 6.732201370271075, "grad_norm": 0.16894866526126862, "learning_rate": 1.4576697709558023e-05, "loss": 0.797, "num_input_tokens_seen": 26072904, "step": 45200 }, { "epoch": 6.732946082812035, "grad_norm": 0.25023865699768066, "learning_rate": 1.4570791018877796e-05, "loss": 0.8034, "num_input_tokens_seen": 26075592, "step": 45205 }, { "epoch": 6.733690795352993, "grad_norm": 0.21626341342926025, "learning_rate": 1.4564885032961112e-05, "loss": 0.7807, "num_input_tokens_seen": 26078408, "step": 45210 }, { "epoch": 6.734435507893953, "grad_norm": 0.18267683684825897, "learning_rate": 1.455897975220707e-05, "loss": 0.8119, "num_input_tokens_seen": 26081288, "step": 45215 }, { "epoch": 6.735180220434912, "grad_norm": 0.20563484728336334, "learning_rate": 1.4553075177014736e-05, "loss": 0.7914, "num_input_tokens_seen": 26084424, "step": 45220 }, { "epoch": 6.7359249329758715, "grad_norm": 0.26655539870262146, "learning_rate": 1.4547171307783097e-05, "loss": 0.7912, "num_input_tokens_seen": 26087624, "step": 45225 }, { "epoch": 6.73666964551683, "grad_norm": 0.2604842483997345, "learning_rate": 1.4541268144911135e-05, "loss": 0.7983, "num_input_tokens_seen": 26090536, "step": 45230 }, { "epoch": 6.73741435805779, "grad_norm": 0.20981961488723755, "learning_rate": 1.4535365688797735e-05, "loss": 0.8237, "num_input_tokens_seen": 26093032, "step": 45235 }, { "epoch": 6.738159070598749, "grad_norm": 0.19970282912254333, "learning_rate": 1.452946393984178e-05, "loss": 0.7981, "num_input_tokens_seen": 26095784, "step": 45240 }, { "epoch": 6.738903783139708, "grad_norm": 0.1938350945711136, "learning_rate": 1.4523562898442084e-05, "loss": 0.7797, "num_input_tokens_seen": 26099144, "step": 45245 }, { "epoch": 6.739648495680667, "grad_norm": 0.1816263347864151, "learning_rate": 1.451766256499741e-05, "loss": 0.7826, "num_input_tokens_seen": 26101736, "step": 45250 }, { "epoch": 6.740393208221627, "grad_norm": 0.27014103531837463, "learning_rate": 1.4511762939906481e-05, "loss": 0.7826, "num_input_tokens_seen": 26104584, "step": 45255 }, { "epoch": 6.7411379207625854, "grad_norm": 0.23986583948135376, "learning_rate": 1.4505864023567983e-05, "loss": 0.8013, "num_input_tokens_seen": 26107464, "step": 45260 }, { "epoch": 6.741882633303545, "grad_norm": 0.2713179290294647, "learning_rate": 1.4499965816380525e-05, "loss": 0.7876, "num_input_tokens_seen": 26110088, "step": 45265 }, { "epoch": 6.742627345844504, "grad_norm": 0.27312418818473816, "learning_rate": 1.4494068318742679e-05, "loss": 0.7932, "num_input_tokens_seen": 26112936, "step": 45270 }, { "epoch": 6.7433720583854635, "grad_norm": 0.2517436742782593, "learning_rate": 1.4488171531052982e-05, "loss": 0.8043, "num_input_tokens_seen": 26115784, "step": 45275 }, { "epoch": 6.744116770926422, "grad_norm": 0.264567494392395, "learning_rate": 1.4482275453709915e-05, "loss": 0.8132, "num_input_tokens_seen": 26118760, "step": 45280 }, { "epoch": 6.744861483467382, "grad_norm": 0.2237352728843689, "learning_rate": 1.447638008711191e-05, "loss": 0.7979, "num_input_tokens_seen": 26121448, "step": 45285 }, { "epoch": 6.745606196008341, "grad_norm": 0.2581302523612976, "learning_rate": 1.4470485431657355e-05, "loss": 0.8259, "num_input_tokens_seen": 26124200, "step": 45290 }, { "epoch": 6.7463509085493, "grad_norm": 0.32001379132270813, "learning_rate": 1.446459148774459e-05, "loss": 0.7898, "num_input_tokens_seen": 26127048, "step": 45295 }, { "epoch": 6.747095621090259, "grad_norm": 0.21254006028175354, "learning_rate": 1.4458698255771902e-05, "loss": 0.8067, "num_input_tokens_seen": 26129960, "step": 45300 }, { "epoch": 6.747840333631219, "grad_norm": 0.16810141503810883, "learning_rate": 1.445280573613752e-05, "loss": 0.7968, "num_input_tokens_seen": 26132840, "step": 45305 }, { "epoch": 6.7485850461721775, "grad_norm": 0.1766459345817566, "learning_rate": 1.4446913929239642e-05, "loss": 0.7906, "num_input_tokens_seen": 26135656, "step": 45310 }, { "epoch": 6.749329758713137, "grad_norm": 0.26155468821525574, "learning_rate": 1.4441022835476414e-05, "loss": 0.8248, "num_input_tokens_seen": 26138728, "step": 45315 }, { "epoch": 6.750074471254096, "grad_norm": 0.21745166182518005, "learning_rate": 1.4435132455245934e-05, "loss": 0.8108, "num_input_tokens_seen": 26142120, "step": 45320 }, { "epoch": 6.7508191837950555, "grad_norm": 0.21450310945510864, "learning_rate": 1.4429242788946259e-05, "loss": 0.827, "num_input_tokens_seen": 26145032, "step": 45325 }, { "epoch": 6.751563896336014, "grad_norm": 0.19290199875831604, "learning_rate": 1.442335383697537e-05, "loss": 0.8019, "num_input_tokens_seen": 26148008, "step": 45330 }, { "epoch": 6.752308608876973, "grad_norm": 0.2212921530008316, "learning_rate": 1.4417465599731226e-05, "loss": 0.7983, "num_input_tokens_seen": 26150888, "step": 45335 }, { "epoch": 6.753053321417933, "grad_norm": 0.1966477781534195, "learning_rate": 1.4411578077611743e-05, "loss": 0.7742, "num_input_tokens_seen": 26153896, "step": 45340 }, { "epoch": 6.753798033958892, "grad_norm": 0.2338162660598755, "learning_rate": 1.4405691271014751e-05, "loss": 0.7745, "num_input_tokens_seen": 26156840, "step": 45345 }, { "epoch": 6.754542746499851, "grad_norm": 0.22200606763362885, "learning_rate": 1.4399805180338072e-05, "loss": 0.7911, "num_input_tokens_seen": 26159816, "step": 45350 }, { "epoch": 6.75528745904081, "grad_norm": 0.20222614705562592, "learning_rate": 1.439391980597946e-05, "loss": 0.7898, "num_input_tokens_seen": 26162792, "step": 45355 }, { "epoch": 6.7560321715817695, "grad_norm": 0.25766506791114807, "learning_rate": 1.4388035148336637e-05, "loss": 0.8005, "num_input_tokens_seen": 26165832, "step": 45360 }, { "epoch": 6.756776884122728, "grad_norm": 0.22809235751628876, "learning_rate": 1.4382151207807243e-05, "loss": 0.8047, "num_input_tokens_seen": 26168744, "step": 45365 }, { "epoch": 6.757521596663688, "grad_norm": 0.26154816150665283, "learning_rate": 1.4376267984788902e-05, "loss": 0.8077, "num_input_tokens_seen": 26171560, "step": 45370 }, { "epoch": 6.758266309204647, "grad_norm": 0.1992824524641037, "learning_rate": 1.4370385479679177e-05, "loss": 0.7923, "num_input_tokens_seen": 26174472, "step": 45375 }, { "epoch": 6.759011021745606, "grad_norm": 0.19367942214012146, "learning_rate": 1.4364503692875598e-05, "loss": 0.7961, "num_input_tokens_seen": 26177352, "step": 45380 }, { "epoch": 6.759755734286565, "grad_norm": 0.19618888199329376, "learning_rate": 1.4358622624775603e-05, "loss": 0.7974, "num_input_tokens_seen": 26180136, "step": 45385 }, { "epoch": 6.760500446827525, "grad_norm": 0.2821255028247833, "learning_rate": 1.4352742275776632e-05, "loss": 0.7986, "num_input_tokens_seen": 26182920, "step": 45390 }, { "epoch": 6.7612451593684835, "grad_norm": 0.20667098462581635, "learning_rate": 1.4346862646276055e-05, "loss": 0.8246, "num_input_tokens_seen": 26185832, "step": 45395 }, { "epoch": 6.761989871909443, "grad_norm": 0.21278443932533264, "learning_rate": 1.4340983736671179e-05, "loss": 0.783, "num_input_tokens_seen": 26188520, "step": 45400 }, { "epoch": 6.762734584450402, "grad_norm": 0.2236848920583725, "learning_rate": 1.4335105547359287e-05, "loss": 0.8095, "num_input_tokens_seen": 26191272, "step": 45405 }, { "epoch": 6.7634792969913615, "grad_norm": 0.23296552896499634, "learning_rate": 1.43292280787376e-05, "loss": 0.773, "num_input_tokens_seen": 26194152, "step": 45410 }, { "epoch": 6.76422400953232, "grad_norm": 0.1821817308664322, "learning_rate": 1.4323351331203296e-05, "loss": 0.759, "num_input_tokens_seen": 26196680, "step": 45415 }, { "epoch": 6.76496872207328, "grad_norm": 0.33981257677078247, "learning_rate": 1.431747530515351e-05, "loss": 0.7674, "num_input_tokens_seen": 26199688, "step": 45420 }, { "epoch": 6.765713434614239, "grad_norm": 0.20024637877941132, "learning_rate": 1.4311600000985303e-05, "loss": 0.7909, "num_input_tokens_seen": 26202568, "step": 45425 }, { "epoch": 6.766458147155198, "grad_norm": 0.24452391266822815, "learning_rate": 1.4305725419095722e-05, "loss": 0.8038, "num_input_tokens_seen": 26205736, "step": 45430 }, { "epoch": 6.767202859696157, "grad_norm": 0.19791029393672943, "learning_rate": 1.4299851559881727e-05, "loss": 0.7857, "num_input_tokens_seen": 26208712, "step": 45435 }, { "epoch": 6.767947572237117, "grad_norm": 0.24349230527877808, "learning_rate": 1.4293978423740259e-05, "loss": 0.7959, "num_input_tokens_seen": 26211592, "step": 45440 }, { "epoch": 6.7686922847780755, "grad_norm": 0.2673076093196869, "learning_rate": 1.4288106011068203e-05, "loss": 0.818, "num_input_tokens_seen": 26214504, "step": 45445 }, { "epoch": 6.769436997319035, "grad_norm": 0.16780684888362885, "learning_rate": 1.4282234322262389e-05, "loss": 0.7957, "num_input_tokens_seen": 26217352, "step": 45450 }, { "epoch": 6.770181709859994, "grad_norm": 0.25226080417633057, "learning_rate": 1.4276363357719605e-05, "loss": 0.7739, "num_input_tokens_seen": 26220296, "step": 45455 }, { "epoch": 6.7709264224009535, "grad_norm": 0.1831429898738861, "learning_rate": 1.4270493117836597e-05, "loss": 0.8282, "num_input_tokens_seen": 26222824, "step": 45460 }, { "epoch": 6.771671134941912, "grad_norm": 0.15917448699474335, "learning_rate": 1.4264623603010042e-05, "loss": 0.8134, "num_input_tokens_seen": 26225512, "step": 45465 }, { "epoch": 6.772415847482872, "grad_norm": 0.21199098229408264, "learning_rate": 1.4258754813636565e-05, "loss": 0.7903, "num_input_tokens_seen": 26228296, "step": 45470 }, { "epoch": 6.773160560023831, "grad_norm": 0.24391648173332214, "learning_rate": 1.4252886750112768e-05, "loss": 0.7923, "num_input_tokens_seen": 26231144, "step": 45475 }, { "epoch": 6.77390527256479, "grad_norm": 0.2606872618198395, "learning_rate": 1.4247019412835188e-05, "loss": 0.7892, "num_input_tokens_seen": 26233960, "step": 45480 }, { "epoch": 6.774649985105749, "grad_norm": 0.1610528826713562, "learning_rate": 1.4241152802200319e-05, "loss": 0.8063, "num_input_tokens_seen": 26236584, "step": 45485 }, { "epoch": 6.775394697646709, "grad_norm": 0.14885923266410828, "learning_rate": 1.4235286918604613e-05, "loss": 0.8296, "num_input_tokens_seen": 26239400, "step": 45490 }, { "epoch": 6.7761394101876675, "grad_norm": 0.27794721722602844, "learning_rate": 1.422942176244444e-05, "loss": 0.8335, "num_input_tokens_seen": 26242184, "step": 45495 }, { "epoch": 6.776884122728626, "grad_norm": 0.17992211878299713, "learning_rate": 1.4223557334116167e-05, "loss": 0.8001, "num_input_tokens_seen": 26244808, "step": 45500 }, { "epoch": 6.777628835269586, "grad_norm": 0.1693054884672165, "learning_rate": 1.421769363401606e-05, "loss": 0.79, "num_input_tokens_seen": 26247560, "step": 45505 }, { "epoch": 6.778373547810546, "grad_norm": 0.2133781760931015, "learning_rate": 1.4211830662540381e-05, "loss": 0.8143, "num_input_tokens_seen": 26250536, "step": 45510 }, { "epoch": 6.779118260351504, "grad_norm": 0.28516510128974915, "learning_rate": 1.4205968420085324e-05, "loss": 0.8203, "num_input_tokens_seen": 26253544, "step": 45515 }, { "epoch": 6.779862972892463, "grad_norm": 0.2090965360403061, "learning_rate": 1.4200106907047039e-05, "loss": 0.7911, "num_input_tokens_seen": 26256360, "step": 45520 }, { "epoch": 6.780607685433423, "grad_norm": 0.21569925546646118, "learning_rate": 1.419424612382163e-05, "loss": 0.7893, "num_input_tokens_seen": 26259272, "step": 45525 }, { "epoch": 6.781352397974382, "grad_norm": 0.18020661175251007, "learning_rate": 1.418838607080512e-05, "loss": 0.8054, "num_input_tokens_seen": 26262280, "step": 45530 }, { "epoch": 6.782097110515341, "grad_norm": 0.1985662281513214, "learning_rate": 1.4182526748393526e-05, "loss": 0.8254, "num_input_tokens_seen": 26264968, "step": 45535 }, { "epoch": 6.7828418230563, "grad_norm": 0.21767574548721313, "learning_rate": 1.4176668156982798e-05, "loss": 0.7882, "num_input_tokens_seen": 26268072, "step": 45540 }, { "epoch": 6.7835865355972595, "grad_norm": 0.27299025654792786, "learning_rate": 1.4170810296968834e-05, "loss": 0.8387, "num_input_tokens_seen": 26270920, "step": 45545 }, { "epoch": 6.784331248138218, "grad_norm": 0.2228117138147354, "learning_rate": 1.4164953168747475e-05, "loss": 0.7911, "num_input_tokens_seen": 26273800, "step": 45550 }, { "epoch": 6.785075960679178, "grad_norm": 0.25753530859947205, "learning_rate": 1.4159096772714531e-05, "loss": 0.7684, "num_input_tokens_seen": 26276712, "step": 45555 }, { "epoch": 6.785820673220137, "grad_norm": 0.17311321198940277, "learning_rate": 1.4153241109265759e-05, "loss": 0.7835, "num_input_tokens_seen": 26279400, "step": 45560 }, { "epoch": 6.786565385761096, "grad_norm": 0.20682641863822937, "learning_rate": 1.414738617879684e-05, "loss": 0.8069, "num_input_tokens_seen": 26282344, "step": 45565 }, { "epoch": 6.787310098302055, "grad_norm": 0.24032801389694214, "learning_rate": 1.4141531981703444e-05, "loss": 0.8006, "num_input_tokens_seen": 26285352, "step": 45570 }, { "epoch": 6.788054810843015, "grad_norm": 0.187456876039505, "learning_rate": 1.4135678518381168e-05, "loss": 0.8042, "num_input_tokens_seen": 26288104, "step": 45575 }, { "epoch": 6.7887995233839735, "grad_norm": 0.19611909985542297, "learning_rate": 1.4129825789225564e-05, "loss": 0.8185, "num_input_tokens_seen": 26290824, "step": 45580 }, { "epoch": 6.789544235924933, "grad_norm": 0.29745954275131226, "learning_rate": 1.412397379463215e-05, "loss": 0.8239, "num_input_tokens_seen": 26293832, "step": 45585 }, { "epoch": 6.790288948465892, "grad_norm": 0.22814129292964935, "learning_rate": 1.4118122534996358e-05, "loss": 0.7961, "num_input_tokens_seen": 26296872, "step": 45590 }, { "epoch": 6.791033661006852, "grad_norm": 0.15798543393611908, "learning_rate": 1.4112272010713617e-05, "loss": 0.8237, "num_input_tokens_seen": 26299784, "step": 45595 }, { "epoch": 6.79177837354781, "grad_norm": 0.21764962375164032, "learning_rate": 1.4106422222179252e-05, "loss": 0.8059, "num_input_tokens_seen": 26302440, "step": 45600 }, { "epoch": 6.79252308608877, "grad_norm": 0.30790549516677856, "learning_rate": 1.4100573169788584e-05, "loss": 0.8021, "num_input_tokens_seen": 26304968, "step": 45605 }, { "epoch": 6.793267798629729, "grad_norm": 0.21937967836856842, "learning_rate": 1.4094724853936869e-05, "loss": 0.8049, "num_input_tokens_seen": 26307784, "step": 45610 }, { "epoch": 6.794012511170688, "grad_norm": 0.2103395164012909, "learning_rate": 1.4088877275019311e-05, "loss": 0.7995, "num_input_tokens_seen": 26310728, "step": 45615 }, { "epoch": 6.794757223711647, "grad_norm": 0.23999950289726257, "learning_rate": 1.4083030433431066e-05, "loss": 0.7743, "num_input_tokens_seen": 26313448, "step": 45620 }, { "epoch": 6.795501936252607, "grad_norm": 0.23398476839065552, "learning_rate": 1.4077184329567244e-05, "loss": 0.789, "num_input_tokens_seen": 26316296, "step": 45625 }, { "epoch": 6.7962466487935655, "grad_norm": 0.25962159037590027, "learning_rate": 1.40713389638229e-05, "loss": 0.8054, "num_input_tokens_seen": 26318984, "step": 45630 }, { "epoch": 6.796991361334525, "grad_norm": 0.2317674458026886, "learning_rate": 1.4065494336593027e-05, "loss": 0.7794, "num_input_tokens_seen": 26322088, "step": 45635 }, { "epoch": 6.797736073875484, "grad_norm": 0.3317936360836029, "learning_rate": 1.4059650448272587e-05, "loss": 0.8101, "num_input_tokens_seen": 26325000, "step": 45640 }, { "epoch": 6.798480786416444, "grad_norm": 0.15276913344860077, "learning_rate": 1.4053807299256495e-05, "loss": 0.8161, "num_input_tokens_seen": 26327912, "step": 45645 }, { "epoch": 6.799225498957402, "grad_norm": 0.1720125377178192, "learning_rate": 1.4047964889939596e-05, "loss": 0.7879, "num_input_tokens_seen": 26330536, "step": 45650 }, { "epoch": 6.799970211498362, "grad_norm": 0.2009509801864624, "learning_rate": 1.4042123220716713e-05, "loss": 0.7798, "num_input_tokens_seen": 26333544, "step": 45655 }, { "epoch": 6.800714924039321, "grad_norm": 0.2437688410282135, "learning_rate": 1.4036282291982583e-05, "loss": 0.8315, "num_input_tokens_seen": 26336328, "step": 45660 }, { "epoch": 6.8014596365802795, "grad_norm": 0.1720428615808487, "learning_rate": 1.403044210413193e-05, "loss": 0.8385, "num_input_tokens_seen": 26339144, "step": 45665 }, { "epoch": 6.802204349121239, "grad_norm": 0.17447994649410248, "learning_rate": 1.4024602657559393e-05, "loss": 0.816, "num_input_tokens_seen": 26342184, "step": 45670 }, { "epoch": 6.802949061662199, "grad_norm": 0.19333945214748383, "learning_rate": 1.4018763952659581e-05, "loss": 0.8089, "num_input_tokens_seen": 26345256, "step": 45675 }, { "epoch": 6.803693774203158, "grad_norm": 0.1583136022090912, "learning_rate": 1.4012925989827058e-05, "loss": 0.8439, "num_input_tokens_seen": 26348072, "step": 45680 }, { "epoch": 6.804438486744116, "grad_norm": 0.3662835359573364, "learning_rate": 1.4007088769456326e-05, "loss": 0.8046, "num_input_tokens_seen": 26351144, "step": 45685 }, { "epoch": 6.805183199285076, "grad_norm": 0.1863327920436859, "learning_rate": 1.400125229194185e-05, "loss": 0.8033, "num_input_tokens_seen": 26353960, "step": 45690 }, { "epoch": 6.805927911826036, "grad_norm": 0.1839587688446045, "learning_rate": 1.3995416557678016e-05, "loss": 0.8047, "num_input_tokens_seen": 26356712, "step": 45695 }, { "epoch": 6.806672624366994, "grad_norm": 0.22414632141590118, "learning_rate": 1.398958156705919e-05, "loss": 0.8036, "num_input_tokens_seen": 26359560, "step": 45700 }, { "epoch": 6.807417336907953, "grad_norm": 0.1878689080476761, "learning_rate": 1.3983747320479688e-05, "loss": 0.8224, "num_input_tokens_seen": 26362536, "step": 45705 }, { "epoch": 6.808162049448913, "grad_norm": 0.1476607322692871, "learning_rate": 1.3977913818333744e-05, "loss": 0.8066, "num_input_tokens_seen": 26365480, "step": 45710 }, { "epoch": 6.8089067619898715, "grad_norm": 0.2270842045545578, "learning_rate": 1.3972081061015569e-05, "loss": 0.7938, "num_input_tokens_seen": 26368456, "step": 45715 }, { "epoch": 6.809651474530831, "grad_norm": 0.18511545658111572, "learning_rate": 1.396624904891932e-05, "loss": 0.7972, "num_input_tokens_seen": 26371528, "step": 45720 }, { "epoch": 6.81039618707179, "grad_norm": 0.17181657254695892, "learning_rate": 1.3960417782439112e-05, "loss": 0.8024, "num_input_tokens_seen": 26374088, "step": 45725 }, { "epoch": 6.81114089961275, "grad_norm": 0.2599620223045349, "learning_rate": 1.3954587261968974e-05, "loss": 0.8026, "num_input_tokens_seen": 26376936, "step": 45730 }, { "epoch": 6.811885612153708, "grad_norm": 0.2213534116744995, "learning_rate": 1.3948757487902923e-05, "loss": 0.811, "num_input_tokens_seen": 26379752, "step": 45735 }, { "epoch": 6.812630324694668, "grad_norm": 0.20158067345619202, "learning_rate": 1.3942928460634907e-05, "loss": 0.7496, "num_input_tokens_seen": 26382920, "step": 45740 }, { "epoch": 6.813375037235627, "grad_norm": 0.25427475571632385, "learning_rate": 1.3937100180558846e-05, "loss": 0.7835, "num_input_tokens_seen": 26385960, "step": 45745 }, { "epoch": 6.814119749776586, "grad_norm": 0.1736753135919571, "learning_rate": 1.3931272648068565e-05, "loss": 0.8157, "num_input_tokens_seen": 26388744, "step": 45750 }, { "epoch": 6.814864462317545, "grad_norm": 0.24060426652431488, "learning_rate": 1.3925445863557873e-05, "loss": 0.7897, "num_input_tokens_seen": 26391592, "step": 45755 }, { "epoch": 6.815609174858505, "grad_norm": 0.18257911503314972, "learning_rate": 1.3919619827420538e-05, "loss": 0.7818, "num_input_tokens_seen": 26394504, "step": 45760 }, { "epoch": 6.816353887399464, "grad_norm": 0.22317318618297577, "learning_rate": 1.3913794540050234e-05, "loss": 0.8024, "num_input_tokens_seen": 26397256, "step": 45765 }, { "epoch": 6.817098599940423, "grad_norm": 0.18828508257865906, "learning_rate": 1.390797000184062e-05, "loss": 0.7797, "num_input_tokens_seen": 26400072, "step": 45770 }, { "epoch": 6.817843312481382, "grad_norm": 0.25879794359207153, "learning_rate": 1.3902146213185297e-05, "loss": 0.7586, "num_input_tokens_seen": 26402920, "step": 45775 }, { "epoch": 6.818588025022342, "grad_norm": 0.21548810601234436, "learning_rate": 1.3896323174477815e-05, "loss": 0.8171, "num_input_tokens_seen": 26406056, "step": 45780 }, { "epoch": 6.8193327375633, "grad_norm": 0.19411152601242065, "learning_rate": 1.3890500886111673e-05, "loss": 0.8152, "num_input_tokens_seen": 26409224, "step": 45785 }, { "epoch": 6.82007745010426, "grad_norm": 0.17264199256896973, "learning_rate": 1.3884679348480309e-05, "loss": 0.8145, "num_input_tokens_seen": 26411912, "step": 45790 }, { "epoch": 6.820822162645219, "grad_norm": 0.23539339005947113, "learning_rate": 1.3878858561977131e-05, "loss": 0.809, "num_input_tokens_seen": 26414920, "step": 45795 }, { "epoch": 6.821566875186178, "grad_norm": 0.21187543869018555, "learning_rate": 1.3873038526995466e-05, "loss": 0.8052, "num_input_tokens_seen": 26417768, "step": 45800 }, { "epoch": 6.822311587727137, "grad_norm": 0.22269722819328308, "learning_rate": 1.386721924392862e-05, "loss": 0.832, "num_input_tokens_seen": 26420744, "step": 45805 }, { "epoch": 6.823056300268097, "grad_norm": 0.2171822190284729, "learning_rate": 1.3861400713169831e-05, "loss": 0.8042, "num_input_tokens_seen": 26423560, "step": 45810 }, { "epoch": 6.823801012809056, "grad_norm": 0.2629086375236511, "learning_rate": 1.38555829351123e-05, "loss": 0.8028, "num_input_tokens_seen": 26426440, "step": 45815 }, { "epoch": 6.824545725350015, "grad_norm": 0.2739885151386261, "learning_rate": 1.384976591014917e-05, "loss": 0.7659, "num_input_tokens_seen": 26429416, "step": 45820 }, { "epoch": 6.825290437890974, "grad_norm": 0.18736569583415985, "learning_rate": 1.384394963867352e-05, "loss": 0.7881, "num_input_tokens_seen": 26432104, "step": 45825 }, { "epoch": 6.826035150431934, "grad_norm": 0.216889888048172, "learning_rate": 1.3838134121078403e-05, "loss": 0.7773, "num_input_tokens_seen": 26435048, "step": 45830 }, { "epoch": 6.826779862972892, "grad_norm": 0.290731817483902, "learning_rate": 1.3832319357756793e-05, "loss": 0.7958, "num_input_tokens_seen": 26437832, "step": 45835 }, { "epoch": 6.827524575513852, "grad_norm": 0.2121756225824356, "learning_rate": 1.3826505349101637e-05, "loss": 0.8158, "num_input_tokens_seen": 26440392, "step": 45840 }, { "epoch": 6.828269288054811, "grad_norm": 0.22720645368099213, "learning_rate": 1.3820692095505819e-05, "loss": 0.7945, "num_input_tokens_seen": 26443048, "step": 45845 }, { "epoch": 6.82901400059577, "grad_norm": 0.24124173820018768, "learning_rate": 1.381487959736218e-05, "loss": 0.826, "num_input_tokens_seen": 26446120, "step": 45850 }, { "epoch": 6.829758713136729, "grad_norm": 0.16330979764461517, "learning_rate": 1.3809067855063512e-05, "loss": 0.8013, "num_input_tokens_seen": 26449128, "step": 45855 }, { "epoch": 6.830503425677689, "grad_norm": 0.22903487086296082, "learning_rate": 1.3803256869002529e-05, "loss": 0.7908, "num_input_tokens_seen": 26452264, "step": 45860 }, { "epoch": 6.831248138218648, "grad_norm": 0.2639271318912506, "learning_rate": 1.379744663957193e-05, "loss": 0.7872, "num_input_tokens_seen": 26455176, "step": 45865 }, { "epoch": 6.831992850759606, "grad_norm": 0.19240039587020874, "learning_rate": 1.3791637167164337e-05, "loss": 0.8229, "num_input_tokens_seen": 26457928, "step": 45870 }, { "epoch": 6.832737563300566, "grad_norm": 0.24592728912830353, "learning_rate": 1.3785828452172333e-05, "loss": 0.7582, "num_input_tokens_seen": 26462472, "step": 45875 }, { "epoch": 6.833482275841525, "grad_norm": 0.33237066864967346, "learning_rate": 1.3780020494988446e-05, "loss": 0.7817, "num_input_tokens_seen": 26465224, "step": 45880 }, { "epoch": 6.834226988382484, "grad_norm": 0.2755981683731079, "learning_rate": 1.3774213296005159e-05, "loss": 0.7718, "num_input_tokens_seen": 26467976, "step": 45885 }, { "epoch": 6.834971700923443, "grad_norm": 0.23297666013240814, "learning_rate": 1.3768406855614907e-05, "loss": 0.8178, "num_input_tokens_seen": 26470888, "step": 45890 }, { "epoch": 6.835716413464403, "grad_norm": 0.21986941993236542, "learning_rate": 1.3762601174210044e-05, "loss": 0.7784, "num_input_tokens_seen": 26473864, "step": 45895 }, { "epoch": 6.836461126005362, "grad_norm": 0.18675051629543304, "learning_rate": 1.3756796252182907e-05, "loss": 0.7983, "num_input_tokens_seen": 26476648, "step": 45900 }, { "epoch": 6.837205838546321, "grad_norm": 0.18249154090881348, "learning_rate": 1.3750992089925777e-05, "loss": 0.8124, "num_input_tokens_seen": 26479336, "step": 45905 }, { "epoch": 6.83795055108728, "grad_norm": 0.2598186135292053, "learning_rate": 1.3745188687830857e-05, "loss": 0.7726, "num_input_tokens_seen": 26482120, "step": 45910 }, { "epoch": 6.83869526362824, "grad_norm": 0.22271013259887695, "learning_rate": 1.3739386046290326e-05, "loss": 0.8347, "num_input_tokens_seen": 26485064, "step": 45915 }, { "epoch": 6.839439976169198, "grad_norm": 0.18642057478427887, "learning_rate": 1.3733584165696304e-05, "loss": 0.7942, "num_input_tokens_seen": 26487848, "step": 45920 }, { "epoch": 6.840184688710158, "grad_norm": 0.21397756040096283, "learning_rate": 1.3727783046440868e-05, "loss": 0.8399, "num_input_tokens_seen": 26491048, "step": 45925 }, { "epoch": 6.840929401251117, "grad_norm": 0.16913552582263947, "learning_rate": 1.3721982688916014e-05, "loss": 0.8192, "num_input_tokens_seen": 26493640, "step": 45930 }, { "epoch": 6.8416741137920765, "grad_norm": 0.2298426330089569, "learning_rate": 1.3716183093513717e-05, "loss": 0.8149, "num_input_tokens_seen": 26496488, "step": 45935 }, { "epoch": 6.842418826333035, "grad_norm": 0.21964344382286072, "learning_rate": 1.3710384260625891e-05, "loss": 0.7931, "num_input_tokens_seen": 26499720, "step": 45940 }, { "epoch": 6.843163538873995, "grad_norm": 0.25241798162460327, "learning_rate": 1.3704586190644405e-05, "loss": 0.7798, "num_input_tokens_seen": 26502632, "step": 45945 }, { "epoch": 6.843908251414954, "grad_norm": 0.21263842284679413, "learning_rate": 1.369878888396105e-05, "loss": 0.7851, "num_input_tokens_seen": 26505576, "step": 45950 }, { "epoch": 6.844652963955913, "grad_norm": 0.19819289445877075, "learning_rate": 1.3692992340967598e-05, "loss": 0.7941, "num_input_tokens_seen": 26508264, "step": 45955 }, { "epoch": 6.845397676496872, "grad_norm": 0.19291862845420837, "learning_rate": 1.368719656205576e-05, "loss": 0.8131, "num_input_tokens_seen": 26511144, "step": 45960 }, { "epoch": 6.846142389037832, "grad_norm": 0.244570791721344, "learning_rate": 1.3681401547617173e-05, "loss": 0.8077, "num_input_tokens_seen": 26513864, "step": 45965 }, { "epoch": 6.84688710157879, "grad_norm": 0.27974438667297363, "learning_rate": 1.3675607298043453e-05, "loss": 0.787, "num_input_tokens_seen": 26516744, "step": 45970 }, { "epoch": 6.84763181411975, "grad_norm": 0.23306472599506378, "learning_rate": 1.3669813813726151e-05, "loss": 0.7778, "num_input_tokens_seen": 26519688, "step": 45975 }, { "epoch": 6.848376526660709, "grad_norm": 0.2674541175365448, "learning_rate": 1.3664021095056764e-05, "loss": 0.8005, "num_input_tokens_seen": 26522504, "step": 45980 }, { "epoch": 6.8491212392016685, "grad_norm": 0.17079167068004608, "learning_rate": 1.3658229142426754e-05, "loss": 0.7993, "num_input_tokens_seen": 26525224, "step": 45985 }, { "epoch": 6.849865951742627, "grad_norm": 0.16222704946994781, "learning_rate": 1.3652437956227496e-05, "loss": 0.8081, "num_input_tokens_seen": 26527944, "step": 45990 }, { "epoch": 6.850610664283587, "grad_norm": 0.20453336834907532, "learning_rate": 1.3646647536850354e-05, "loss": 0.7919, "num_input_tokens_seen": 26530728, "step": 45995 }, { "epoch": 6.851355376824546, "grad_norm": 0.16197237372398376, "learning_rate": 1.3640857884686603e-05, "loss": 0.7927, "num_input_tokens_seen": 26533416, "step": 46000 }, { "epoch": 6.852100089365505, "grad_norm": 0.2212846428155899, "learning_rate": 1.3635069000127493e-05, "loss": 0.7812, "num_input_tokens_seen": 26536488, "step": 46005 }, { "epoch": 6.852844801906464, "grad_norm": 0.19666580855846405, "learning_rate": 1.3629280883564217e-05, "loss": 0.8033, "num_input_tokens_seen": 26539144, "step": 46010 }, { "epoch": 6.853589514447423, "grad_norm": 0.15254631638526917, "learning_rate": 1.3623493535387905e-05, "loss": 0.7788, "num_input_tokens_seen": 26541992, "step": 46015 }, { "epoch": 6.8543342269883825, "grad_norm": 0.17397725582122803, "learning_rate": 1.3617706955989656e-05, "loss": 0.784, "num_input_tokens_seen": 26544840, "step": 46020 }, { "epoch": 6.855078939529342, "grad_norm": 0.22434455156326294, "learning_rate": 1.3611921145760487e-05, "loss": 0.7651, "num_input_tokens_seen": 26548040, "step": 46025 }, { "epoch": 6.855823652070301, "grad_norm": 0.2207544893026352, "learning_rate": 1.3606136105091393e-05, "loss": 0.7988, "num_input_tokens_seen": 26550888, "step": 46030 }, { "epoch": 6.85656836461126, "grad_norm": 0.26816117763519287, "learning_rate": 1.3600351834373286e-05, "loss": 0.7905, "num_input_tokens_seen": 26553608, "step": 46035 }, { "epoch": 6.857313077152219, "grad_norm": 0.2965819835662842, "learning_rate": 1.3594568333997059e-05, "loss": 0.7896, "num_input_tokens_seen": 26556936, "step": 46040 }, { "epoch": 6.858057789693179, "grad_norm": 0.24027694761753082, "learning_rate": 1.3588785604353532e-05, "loss": 0.817, "num_input_tokens_seen": 26559624, "step": 46045 }, { "epoch": 6.858802502234138, "grad_norm": 0.24227161705493927, "learning_rate": 1.3583003645833478e-05, "loss": 0.8054, "num_input_tokens_seen": 26562280, "step": 46050 }, { "epoch": 6.859547214775096, "grad_norm": 0.22413566708564758, "learning_rate": 1.3577222458827628e-05, "loss": 0.7921, "num_input_tokens_seen": 26565256, "step": 46055 }, { "epoch": 6.860291927316056, "grad_norm": 0.20840582251548767, "learning_rate": 1.3571442043726634e-05, "loss": 0.8054, "num_input_tokens_seen": 26568040, "step": 46060 }, { "epoch": 6.861036639857015, "grad_norm": 0.3042868375778198, "learning_rate": 1.356566240092113e-05, "loss": 0.8122, "num_input_tokens_seen": 26570792, "step": 46065 }, { "epoch": 6.8617813523979745, "grad_norm": 0.23587200045585632, "learning_rate": 1.3559883530801667e-05, "loss": 0.8317, "num_input_tokens_seen": 26573640, "step": 46070 }, { "epoch": 6.862526064938933, "grad_norm": 0.1758110225200653, "learning_rate": 1.355410543375876e-05, "loss": 0.7837, "num_input_tokens_seen": 26576424, "step": 46075 }, { "epoch": 6.863270777479893, "grad_norm": 0.212397500872612, "learning_rate": 1.3548328110182873e-05, "loss": 0.801, "num_input_tokens_seen": 26579272, "step": 46080 }, { "epoch": 6.864015490020852, "grad_norm": 0.21360188722610474, "learning_rate": 1.3542551560464412e-05, "loss": 0.7839, "num_input_tokens_seen": 26581992, "step": 46085 }, { "epoch": 6.864760202561811, "grad_norm": 0.23360715806484222, "learning_rate": 1.3536775784993744e-05, "loss": 0.8214, "num_input_tokens_seen": 26584616, "step": 46090 }, { "epoch": 6.86550491510277, "grad_norm": 0.15504024922847748, "learning_rate": 1.3531000784161152e-05, "loss": 0.8079, "num_input_tokens_seen": 26587336, "step": 46095 }, { "epoch": 6.86624962764373, "grad_norm": 0.2848031520843506, "learning_rate": 1.3525226558356895e-05, "loss": 0.7733, "num_input_tokens_seen": 26590120, "step": 46100 }, { "epoch": 6.8669943401846885, "grad_norm": 0.19376161694526672, "learning_rate": 1.3519453107971191e-05, "loss": 0.766, "num_input_tokens_seen": 26592520, "step": 46105 }, { "epoch": 6.867739052725648, "grad_norm": 0.18244002759456635, "learning_rate": 1.3513680433394154e-05, "loss": 0.7847, "num_input_tokens_seen": 26595432, "step": 46110 }, { "epoch": 6.868483765266607, "grad_norm": 0.172439843416214, "learning_rate": 1.3507908535015895e-05, "loss": 0.814, "num_input_tokens_seen": 26598216, "step": 46115 }, { "epoch": 6.8692284778075665, "grad_norm": 0.20695200562477112, "learning_rate": 1.3502137413226453e-05, "loss": 0.7964, "num_input_tokens_seen": 26600904, "step": 46120 }, { "epoch": 6.869973190348525, "grad_norm": 0.18088124692440033, "learning_rate": 1.349636706841583e-05, "loss": 0.8244, "num_input_tokens_seen": 26603496, "step": 46125 }, { "epoch": 6.870717902889485, "grad_norm": 0.2282354086637497, "learning_rate": 1.349059750097394e-05, "loss": 0.8194, "num_input_tokens_seen": 26606440, "step": 46130 }, { "epoch": 6.871462615430444, "grad_norm": 0.19939178228378296, "learning_rate": 1.3484828711290676e-05, "loss": 0.8149, "num_input_tokens_seen": 26609320, "step": 46135 }, { "epoch": 6.872207327971403, "grad_norm": 0.2222711443901062, "learning_rate": 1.347906069975587e-05, "loss": 0.8286, "num_input_tokens_seen": 26612264, "step": 46140 }, { "epoch": 6.872952040512362, "grad_norm": 0.2075221985578537, "learning_rate": 1.34732934667593e-05, "loss": 0.767, "num_input_tokens_seen": 26615208, "step": 46145 }, { "epoch": 6.873696753053322, "grad_norm": 0.21016280353069305, "learning_rate": 1.3467527012690707e-05, "loss": 0.792, "num_input_tokens_seen": 26617992, "step": 46150 }, { "epoch": 6.8744414655942805, "grad_norm": 0.2614869177341461, "learning_rate": 1.3461761337939736e-05, "loss": 0.813, "num_input_tokens_seen": 26621160, "step": 46155 }, { "epoch": 6.87518617813524, "grad_norm": 0.2272542417049408, "learning_rate": 1.3455996442896036e-05, "loss": 0.8039, "num_input_tokens_seen": 26623912, "step": 46160 }, { "epoch": 6.875930890676199, "grad_norm": 0.2658758759498596, "learning_rate": 1.345023232794915e-05, "loss": 0.7784, "num_input_tokens_seen": 26626536, "step": 46165 }, { "epoch": 6.8766756032171585, "grad_norm": 0.19189095497131348, "learning_rate": 1.3444468993488607e-05, "loss": 0.8264, "num_input_tokens_seen": 26629640, "step": 46170 }, { "epoch": 6.877420315758117, "grad_norm": 0.25001800060272217, "learning_rate": 1.3438706439903866e-05, "loss": 0.7844, "num_input_tokens_seen": 26632680, "step": 46175 }, { "epoch": 6.878165028299077, "grad_norm": 0.16249056160449982, "learning_rate": 1.343294466758434e-05, "loss": 0.7943, "num_input_tokens_seen": 26635368, "step": 46180 }, { "epoch": 6.878909740840036, "grad_norm": 0.22166527807712555, "learning_rate": 1.3427183676919396e-05, "loss": 0.8164, "num_input_tokens_seen": 26638248, "step": 46185 }, { "epoch": 6.879654453380995, "grad_norm": 0.23133127391338348, "learning_rate": 1.3421423468298316e-05, "loss": 0.7982, "num_input_tokens_seen": 26641032, "step": 46190 }, { "epoch": 6.880399165921954, "grad_norm": 0.35037243366241455, "learning_rate": 1.3415664042110376e-05, "loss": 0.8102, "num_input_tokens_seen": 26643944, "step": 46195 }, { "epoch": 6.881143878462913, "grad_norm": 0.2270197719335556, "learning_rate": 1.3409905398744748e-05, "loss": 0.785, "num_input_tokens_seen": 26646696, "step": 46200 }, { "epoch": 6.8818885910038725, "grad_norm": 0.2252994030714035, "learning_rate": 1.3404147538590595e-05, "loss": 0.8067, "num_input_tokens_seen": 26649640, "step": 46205 }, { "epoch": 6.882633303544832, "grad_norm": 0.21979743242263794, "learning_rate": 1.3398390462037002e-05, "loss": 0.8157, "num_input_tokens_seen": 26652456, "step": 46210 }, { "epoch": 6.883378016085791, "grad_norm": 0.20239131152629852, "learning_rate": 1.3392634169473018e-05, "loss": 0.7795, "num_input_tokens_seen": 26655176, "step": 46215 }, { "epoch": 6.88412272862675, "grad_norm": 0.17317134141921997, "learning_rate": 1.338687866128763e-05, "loss": 0.8157, "num_input_tokens_seen": 26658088, "step": 46220 }, { "epoch": 6.884867441167709, "grad_norm": 0.21483394503593445, "learning_rate": 1.3381123937869758e-05, "loss": 0.8101, "num_input_tokens_seen": 26660936, "step": 46225 }, { "epoch": 6.885612153708668, "grad_norm": 0.2505674958229065, "learning_rate": 1.33753699996083e-05, "loss": 0.8101, "num_input_tokens_seen": 26663656, "step": 46230 }, { "epoch": 6.886356866249628, "grad_norm": 0.18177856504917145, "learning_rate": 1.3369616846892069e-05, "loss": 0.8355, "num_input_tokens_seen": 26666568, "step": 46235 }, { "epoch": 6.8871015787905865, "grad_norm": 0.2141764760017395, "learning_rate": 1.3363864480109842e-05, "loss": 0.8205, "num_input_tokens_seen": 26669672, "step": 46240 }, { "epoch": 6.887846291331546, "grad_norm": 0.23583906888961792, "learning_rate": 1.3358112899650345e-05, "loss": 0.7995, "num_input_tokens_seen": 26672648, "step": 46245 }, { "epoch": 6.888591003872505, "grad_norm": 0.1631702333688736, "learning_rate": 1.3352362105902246e-05, "loss": 0.8092, "num_input_tokens_seen": 26675656, "step": 46250 }, { "epoch": 6.8893357164134645, "grad_norm": 0.22957094013690948, "learning_rate": 1.3346612099254172e-05, "loss": 0.7833, "num_input_tokens_seen": 26678568, "step": 46255 }, { "epoch": 6.890080428954423, "grad_norm": 0.2619989216327667, "learning_rate": 1.3340862880094661e-05, "loss": 0.778, "num_input_tokens_seen": 26681224, "step": 46260 }, { "epoch": 6.890825141495383, "grad_norm": 0.2375825047492981, "learning_rate": 1.3335114448812235e-05, "loss": 0.7848, "num_input_tokens_seen": 26684232, "step": 46265 }, { "epoch": 6.891569854036342, "grad_norm": 0.26481491327285767, "learning_rate": 1.3329366805795357e-05, "loss": 0.7926, "num_input_tokens_seen": 26686984, "step": 46270 }, { "epoch": 6.892314566577301, "grad_norm": 0.18590089678764343, "learning_rate": 1.3323619951432415e-05, "loss": 0.8047, "num_input_tokens_seen": 26689800, "step": 46275 }, { "epoch": 6.89305927911826, "grad_norm": 0.2994784414768219, "learning_rate": 1.3317873886111759e-05, "loss": 0.777, "num_input_tokens_seen": 26692808, "step": 46280 }, { "epoch": 6.89380399165922, "grad_norm": 0.2584279179573059, "learning_rate": 1.331212861022169e-05, "loss": 0.7854, "num_input_tokens_seen": 26695432, "step": 46285 }, { "epoch": 6.8945487042001785, "grad_norm": 0.28193795680999756, "learning_rate": 1.3306384124150464e-05, "loss": 0.7783, "num_input_tokens_seen": 26698376, "step": 46290 }, { "epoch": 6.895293416741138, "grad_norm": 0.2734445333480835, "learning_rate": 1.3300640428286244e-05, "loss": 0.8176, "num_input_tokens_seen": 26701128, "step": 46295 }, { "epoch": 6.896038129282097, "grad_norm": 0.2163640707731247, "learning_rate": 1.3294897523017177e-05, "loss": 0.8223, "num_input_tokens_seen": 26703912, "step": 46300 }, { "epoch": 6.896782841823057, "grad_norm": 0.2954096496105194, "learning_rate": 1.3289155408731346e-05, "loss": 0.7848, "num_input_tokens_seen": 26707080, "step": 46305 }, { "epoch": 6.897527554364015, "grad_norm": 0.20078788697719574, "learning_rate": 1.3283414085816793e-05, "loss": 0.7703, "num_input_tokens_seen": 26709768, "step": 46310 }, { "epoch": 6.898272266904975, "grad_norm": 0.21906140446662903, "learning_rate": 1.3277673554661466e-05, "loss": 0.7719, "num_input_tokens_seen": 26712840, "step": 46315 }, { "epoch": 6.899016979445934, "grad_norm": 0.23746897280216217, "learning_rate": 1.3271933815653303e-05, "loss": 0.845, "num_input_tokens_seen": 26715752, "step": 46320 }, { "epoch": 6.899761691986893, "grad_norm": 0.27559345960617065, "learning_rate": 1.3266194869180176e-05, "loss": 0.8319, "num_input_tokens_seen": 26718760, "step": 46325 }, { "epoch": 6.900506404527852, "grad_norm": 0.21264362335205078, "learning_rate": 1.3260456715629888e-05, "loss": 0.7798, "num_input_tokens_seen": 26722120, "step": 46330 }, { "epoch": 6.901251117068812, "grad_norm": 0.21194733679294586, "learning_rate": 1.3254719355390206e-05, "loss": 0.8116, "num_input_tokens_seen": 26724936, "step": 46335 }, { "epoch": 6.9019958296097705, "grad_norm": 0.2547944486141205, "learning_rate": 1.3248982788848832e-05, "loss": 0.7898, "num_input_tokens_seen": 26727720, "step": 46340 }, { "epoch": 6.90274054215073, "grad_norm": 0.15540434420108795, "learning_rate": 1.3243247016393429e-05, "loss": 0.8193, "num_input_tokens_seen": 26730536, "step": 46345 }, { "epoch": 6.903485254691689, "grad_norm": 0.30086028575897217, "learning_rate": 1.3237512038411604e-05, "loss": 0.8083, "num_input_tokens_seen": 26733448, "step": 46350 }, { "epoch": 6.904229967232649, "grad_norm": 0.18601734936237335, "learning_rate": 1.3231777855290878e-05, "loss": 0.7715, "num_input_tokens_seen": 26737608, "step": 46355 }, { "epoch": 6.904974679773607, "grad_norm": 0.18703104555606842, "learning_rate": 1.3226044467418771e-05, "loss": 0.823, "num_input_tokens_seen": 26740360, "step": 46360 }, { "epoch": 6.905719392314566, "grad_norm": 0.21059037744998932, "learning_rate": 1.3220311875182701e-05, "loss": 0.8256, "num_input_tokens_seen": 26743240, "step": 46365 }, { "epoch": 6.906464104855526, "grad_norm": 0.19055375456809998, "learning_rate": 1.3214580078970063e-05, "loss": 0.7799, "num_input_tokens_seen": 26746216, "step": 46370 }, { "epoch": 6.907208817396485, "grad_norm": 0.29207780957221985, "learning_rate": 1.3208849079168184e-05, "loss": 0.7908, "num_input_tokens_seen": 26749576, "step": 46375 }, { "epoch": 6.907953529937444, "grad_norm": 0.20105880498886108, "learning_rate": 1.3203118876164345e-05, "loss": 0.8106, "num_input_tokens_seen": 26752648, "step": 46380 }, { "epoch": 6.908698242478403, "grad_norm": 0.2321857213973999, "learning_rate": 1.3197389470345778e-05, "loss": 0.8123, "num_input_tokens_seen": 26755272, "step": 46385 }, { "epoch": 6.909442955019363, "grad_norm": 0.1993979811668396, "learning_rate": 1.3191660862099647e-05, "loss": 0.8083, "num_input_tokens_seen": 26757768, "step": 46390 }, { "epoch": 6.910187667560322, "grad_norm": 0.18628600239753723, "learning_rate": 1.3185933051813057e-05, "loss": 0.8011, "num_input_tokens_seen": 26760488, "step": 46395 }, { "epoch": 6.910932380101281, "grad_norm": 0.1912783980369568, "learning_rate": 1.3180206039873078e-05, "loss": 0.8336, "num_input_tokens_seen": 26763464, "step": 46400 }, { "epoch": 6.91167709264224, "grad_norm": 0.2779353857040405, "learning_rate": 1.317447982666672e-05, "loss": 0.7996, "num_input_tokens_seen": 26766472, "step": 46405 }, { "epoch": 6.912421805183199, "grad_norm": 0.2252514362335205, "learning_rate": 1.3168754412580934e-05, "loss": 0.8277, "num_input_tokens_seen": 26769416, "step": 46410 }, { "epoch": 6.913166517724158, "grad_norm": 0.16862955689430237, "learning_rate": 1.3163029798002625e-05, "loss": 0.8114, "num_input_tokens_seen": 26772232, "step": 46415 }, { "epoch": 6.913911230265118, "grad_norm": 0.21578645706176758, "learning_rate": 1.3157305983318643e-05, "loss": 0.7967, "num_input_tokens_seen": 26775080, "step": 46420 }, { "epoch": 6.9146559428060765, "grad_norm": 0.3115914762020111, "learning_rate": 1.3151582968915766e-05, "loss": 0.7898, "num_input_tokens_seen": 26777832, "step": 46425 }, { "epoch": 6.915400655347036, "grad_norm": 0.21567216515541077, "learning_rate": 1.314586075518075e-05, "loss": 0.8017, "num_input_tokens_seen": 26780584, "step": 46430 }, { "epoch": 6.916145367887995, "grad_norm": 0.22816668450832367, "learning_rate": 1.3140139342500257e-05, "loss": 0.7919, "num_input_tokens_seen": 26783400, "step": 46435 }, { "epoch": 6.916890080428955, "grad_norm": 0.18700425326824188, "learning_rate": 1.3134418731260931e-05, "loss": 0.7914, "num_input_tokens_seen": 26786472, "step": 46440 }, { "epoch": 6.917634792969913, "grad_norm": 0.30579808354377747, "learning_rate": 1.3128698921849344e-05, "loss": 0.766, "num_input_tokens_seen": 26789192, "step": 46445 }, { "epoch": 6.918379505510873, "grad_norm": 0.30704066157341003, "learning_rate": 1.3122979914652016e-05, "loss": 0.7728, "num_input_tokens_seen": 26792488, "step": 46450 }, { "epoch": 6.919124218051832, "grad_norm": 0.2139444798231125, "learning_rate": 1.3117261710055433e-05, "loss": 0.8051, "num_input_tokens_seen": 26795336, "step": 46455 }, { "epoch": 6.919868930592791, "grad_norm": 0.24007035791873932, "learning_rate": 1.3111544308445977e-05, "loss": 0.8112, "num_input_tokens_seen": 26798248, "step": 46460 }, { "epoch": 6.92061364313375, "grad_norm": 0.24863409996032715, "learning_rate": 1.3105827710210026e-05, "loss": 0.7966, "num_input_tokens_seen": 26800872, "step": 46465 }, { "epoch": 6.92135835567471, "grad_norm": 0.198043555021286, "learning_rate": 1.3100111915733887e-05, "loss": 0.8182, "num_input_tokens_seen": 26803720, "step": 46470 }, { "epoch": 6.922103068215669, "grad_norm": 0.16347306966781616, "learning_rate": 1.3094396925403793e-05, "loss": 0.8164, "num_input_tokens_seen": 26806728, "step": 46475 }, { "epoch": 6.922847780756628, "grad_norm": 0.209596186876297, "learning_rate": 1.308868273960595e-05, "loss": 0.8027, "num_input_tokens_seen": 26809320, "step": 46480 }, { "epoch": 6.923592493297587, "grad_norm": 0.19185590744018555, "learning_rate": 1.3082969358726502e-05, "loss": 0.792, "num_input_tokens_seen": 26812072, "step": 46485 }, { "epoch": 6.924337205838547, "grad_norm": 0.2376769483089447, "learning_rate": 1.3077256783151542e-05, "loss": 0.792, "num_input_tokens_seen": 26815048, "step": 46490 }, { "epoch": 6.925081918379505, "grad_norm": 0.2587795555591583, "learning_rate": 1.3071545013267084e-05, "loss": 0.8185, "num_input_tokens_seen": 26818248, "step": 46495 }, { "epoch": 6.925826630920465, "grad_norm": 0.22216372191905975, "learning_rate": 1.3065834049459117e-05, "loss": 0.8215, "num_input_tokens_seen": 26820968, "step": 46500 }, { "epoch": 6.926571343461424, "grad_norm": 0.3385283946990967, "learning_rate": 1.3060123892113562e-05, "loss": 0.8085, "num_input_tokens_seen": 26823912, "step": 46505 }, { "epoch": 6.927316056002383, "grad_norm": 0.1604958325624466, "learning_rate": 1.3054414541616305e-05, "loss": 0.7799, "num_input_tokens_seen": 26826632, "step": 46510 }, { "epoch": 6.928060768543342, "grad_norm": 0.34206727147102356, "learning_rate": 1.3048705998353133e-05, "loss": 0.832, "num_input_tokens_seen": 26829480, "step": 46515 }, { "epoch": 6.928805481084302, "grad_norm": 0.22798597812652588, "learning_rate": 1.3042998262709821e-05, "loss": 0.8014, "num_input_tokens_seen": 26832168, "step": 46520 }, { "epoch": 6.929550193625261, "grad_norm": 0.18224747478961945, "learning_rate": 1.303729133507208e-05, "loss": 0.7788, "num_input_tokens_seen": 26835080, "step": 46525 }, { "epoch": 6.930294906166219, "grad_norm": 0.25146037340164185, "learning_rate": 1.3031585215825545e-05, "loss": 0.8085, "num_input_tokens_seen": 26838088, "step": 46530 }, { "epoch": 6.931039618707179, "grad_norm": 0.1904793530702591, "learning_rate": 1.302587990535582e-05, "loss": 0.7854, "num_input_tokens_seen": 26840872, "step": 46535 }, { "epoch": 6.931784331248139, "grad_norm": 0.2154032438993454, "learning_rate": 1.302017540404845e-05, "loss": 0.7815, "num_input_tokens_seen": 26843880, "step": 46540 }, { "epoch": 6.932529043789097, "grad_norm": 0.2661038041114807, "learning_rate": 1.3014471712288917e-05, "loss": 0.7893, "num_input_tokens_seen": 26846856, "step": 46545 }, { "epoch": 6.933273756330056, "grad_norm": 0.1858997941017151, "learning_rate": 1.300876883046267e-05, "loss": 0.8072, "num_input_tokens_seen": 26849640, "step": 46550 }, { "epoch": 6.934018468871016, "grad_norm": 0.3402250409126282, "learning_rate": 1.3003066758955068e-05, "loss": 0.8044, "num_input_tokens_seen": 26852648, "step": 46555 }, { "epoch": 6.9347631814119755, "grad_norm": 0.25407683849334717, "learning_rate": 1.2997365498151431e-05, "loss": 0.8094, "num_input_tokens_seen": 26855560, "step": 46560 }, { "epoch": 6.935507893952934, "grad_norm": 0.24407747387886047, "learning_rate": 1.2991665048437036e-05, "loss": 0.8029, "num_input_tokens_seen": 26858472, "step": 46565 }, { "epoch": 6.936252606493893, "grad_norm": 0.2699047327041626, "learning_rate": 1.2985965410197092e-05, "loss": 0.7945, "num_input_tokens_seen": 26861096, "step": 46570 }, { "epoch": 6.936997319034853, "grad_norm": 0.2844318151473999, "learning_rate": 1.2980266583816763e-05, "loss": 0.7857, "num_input_tokens_seen": 26864008, "step": 46575 }, { "epoch": 6.937742031575811, "grad_norm": 0.2119852602481842, "learning_rate": 1.2974568569681147e-05, "loss": 0.774, "num_input_tokens_seen": 26866760, "step": 46580 }, { "epoch": 6.938486744116771, "grad_norm": 0.24221071600914001, "learning_rate": 1.2968871368175306e-05, "loss": 0.7902, "num_input_tokens_seen": 26869416, "step": 46585 }, { "epoch": 6.93923145665773, "grad_norm": 0.19058121740818024, "learning_rate": 1.2963174979684223e-05, "loss": 0.7999, "num_input_tokens_seen": 26872296, "step": 46590 }, { "epoch": 6.939976169198689, "grad_norm": 0.20344647765159607, "learning_rate": 1.2957479404592826e-05, "loss": 0.7972, "num_input_tokens_seen": 26875336, "step": 46595 }, { "epoch": 6.940720881739648, "grad_norm": 0.2795390486717224, "learning_rate": 1.2951784643286014e-05, "loss": 0.8044, "num_input_tokens_seen": 26877896, "step": 46600 }, { "epoch": 6.941465594280608, "grad_norm": 0.24995607137680054, "learning_rate": 1.2946090696148606e-05, "loss": 0.7545, "num_input_tokens_seen": 26880872, "step": 46605 }, { "epoch": 6.942210306821567, "grad_norm": 0.19721978902816772, "learning_rate": 1.2940397563565381e-05, "loss": 0.7839, "num_input_tokens_seen": 26883944, "step": 46610 }, { "epoch": 6.942955019362526, "grad_norm": 0.2151990532875061, "learning_rate": 1.2934705245921058e-05, "loss": 0.7521, "num_input_tokens_seen": 26887016, "step": 46615 }, { "epoch": 6.943699731903485, "grad_norm": 0.15496370196342468, "learning_rate": 1.2929013743600316e-05, "loss": 0.7837, "num_input_tokens_seen": 26889992, "step": 46620 }, { "epoch": 6.944444444444445, "grad_norm": 0.19389207661151886, "learning_rate": 1.2923323056987733e-05, "loss": 0.7903, "num_input_tokens_seen": 26892744, "step": 46625 }, { "epoch": 6.945189156985403, "grad_norm": 0.21990595757961273, "learning_rate": 1.2917633186467886e-05, "loss": 0.8195, "num_input_tokens_seen": 26895400, "step": 46630 }, { "epoch": 6.945933869526363, "grad_norm": 0.26500311493873596, "learning_rate": 1.2911944132425261e-05, "loss": 0.7918, "num_input_tokens_seen": 26898152, "step": 46635 }, { "epoch": 6.946678582067322, "grad_norm": 0.2272896021604538, "learning_rate": 1.2906255895244301e-05, "loss": 0.82, "num_input_tokens_seen": 26900840, "step": 46640 }, { "epoch": 6.9474232946082815, "grad_norm": 0.18795670568943024, "learning_rate": 1.29005684753094e-05, "loss": 0.7717, "num_input_tokens_seen": 26903688, "step": 46645 }, { "epoch": 6.94816800714924, "grad_norm": 0.1915806084871292, "learning_rate": 1.2894881873004889e-05, "loss": 0.8119, "num_input_tokens_seen": 26906760, "step": 46650 }, { "epoch": 6.9489127196902, "grad_norm": 0.2704862058162689, "learning_rate": 1.288919608871505e-05, "loss": 0.7788, "num_input_tokens_seen": 26909512, "step": 46655 }, { "epoch": 6.949657432231159, "grad_norm": 0.21012473106384277, "learning_rate": 1.2883511122824093e-05, "loss": 0.7784, "num_input_tokens_seen": 26912296, "step": 46660 }, { "epoch": 6.950402144772118, "grad_norm": 0.3674051761627197, "learning_rate": 1.2877826975716195e-05, "loss": 0.8002, "num_input_tokens_seen": 26914888, "step": 46665 }, { "epoch": 6.951146857313077, "grad_norm": 0.27506962418556213, "learning_rate": 1.2872143647775476e-05, "loss": 0.7874, "num_input_tokens_seen": 26918120, "step": 46670 }, { "epoch": 6.951891569854037, "grad_norm": 0.194946750998497, "learning_rate": 1.2866461139385966e-05, "loss": 0.8308, "num_input_tokens_seen": 26920968, "step": 46675 }, { "epoch": 6.952636282394995, "grad_norm": 0.28034549951553345, "learning_rate": 1.2860779450931684e-05, "loss": 0.7995, "num_input_tokens_seen": 26924072, "step": 46680 }, { "epoch": 6.953380994935955, "grad_norm": 0.16217291355133057, "learning_rate": 1.285509858279657e-05, "loss": 0.7977, "num_input_tokens_seen": 26926920, "step": 46685 }, { "epoch": 6.954125707476914, "grad_norm": 0.21364207565784454, "learning_rate": 1.2849418535364527e-05, "loss": 0.8245, "num_input_tokens_seen": 26929640, "step": 46690 }, { "epoch": 6.9548704200178735, "grad_norm": 0.18862858414649963, "learning_rate": 1.284373930901937e-05, "loss": 0.7904, "num_input_tokens_seen": 26932424, "step": 46695 }, { "epoch": 6.955615132558832, "grad_norm": 0.16307014226913452, "learning_rate": 1.2838060904144888e-05, "loss": 0.7963, "num_input_tokens_seen": 26935272, "step": 46700 }, { "epoch": 6.956359845099792, "grad_norm": 0.2670084536075592, "learning_rate": 1.28323833211248e-05, "loss": 0.8202, "num_input_tokens_seen": 26938216, "step": 46705 }, { "epoch": 6.957104557640751, "grad_norm": 0.15860576927661896, "learning_rate": 1.2826706560342788e-05, "loss": 0.7918, "num_input_tokens_seen": 26941192, "step": 46710 }, { "epoch": 6.957849270181709, "grad_norm": 0.2095826268196106, "learning_rate": 1.2821030622182444e-05, "loss": 0.8149, "num_input_tokens_seen": 26944040, "step": 46715 }, { "epoch": 6.958593982722669, "grad_norm": 0.2362523376941681, "learning_rate": 1.2815355507027344e-05, "loss": 0.7968, "num_input_tokens_seen": 26946536, "step": 46720 }, { "epoch": 6.959338695263629, "grad_norm": 0.1840384304523468, "learning_rate": 1.280968121526097e-05, "loss": 0.7846, "num_input_tokens_seen": 26949320, "step": 46725 }, { "epoch": 6.9600834078045875, "grad_norm": 0.24479644000530243, "learning_rate": 1.2804007747266778e-05, "loss": 0.8154, "num_input_tokens_seen": 26952264, "step": 46730 }, { "epoch": 6.960828120345546, "grad_norm": 0.19607847929000854, "learning_rate": 1.2798335103428157e-05, "loss": 0.7918, "num_input_tokens_seen": 26955080, "step": 46735 }, { "epoch": 6.961572832886506, "grad_norm": 0.2512492537498474, "learning_rate": 1.2792663284128443e-05, "loss": 0.7964, "num_input_tokens_seen": 26957768, "step": 46740 }, { "epoch": 6.962317545427465, "grad_norm": 0.18649844825267792, "learning_rate": 1.2786992289750909e-05, "loss": 0.8202, "num_input_tokens_seen": 26960936, "step": 46745 }, { "epoch": 6.963062257968424, "grad_norm": 0.23079855740070343, "learning_rate": 1.2781322120678796e-05, "loss": 0.7704, "num_input_tokens_seen": 26963944, "step": 46750 }, { "epoch": 6.963806970509383, "grad_norm": 0.23621463775634766, "learning_rate": 1.2775652777295252e-05, "loss": 0.7773, "num_input_tokens_seen": 26966920, "step": 46755 }, { "epoch": 6.964551683050343, "grad_norm": 0.22756077349185944, "learning_rate": 1.2769984259983386e-05, "loss": 0.8267, "num_input_tokens_seen": 26970088, "step": 46760 }, { "epoch": 6.965296395591301, "grad_norm": 0.1281815469264984, "learning_rate": 1.2764316569126258e-05, "loss": 0.7665, "num_input_tokens_seen": 26972968, "step": 46765 }, { "epoch": 6.966041108132261, "grad_norm": 0.23295362293720245, "learning_rate": 1.275864970510687e-05, "loss": 0.8316, "num_input_tokens_seen": 26975752, "step": 46770 }, { "epoch": 6.96678582067322, "grad_norm": 0.1861269176006317, "learning_rate": 1.2752983668308167e-05, "loss": 0.7702, "num_input_tokens_seen": 26978856, "step": 46775 }, { "epoch": 6.9675305332141795, "grad_norm": 0.24461700022220612, "learning_rate": 1.2747318459113033e-05, "loss": 0.7927, "num_input_tokens_seen": 26981608, "step": 46780 }, { "epoch": 6.968275245755138, "grad_norm": 0.2697574496269226, "learning_rate": 1.2741654077904313e-05, "loss": 0.7677, "num_input_tokens_seen": 26984456, "step": 46785 }, { "epoch": 6.969019958296098, "grad_norm": 0.1957613229751587, "learning_rate": 1.273599052506476e-05, "loss": 0.8063, "num_input_tokens_seen": 26987208, "step": 46790 }, { "epoch": 6.969764670837057, "grad_norm": 0.29704421758651733, "learning_rate": 1.2730327800977116e-05, "loss": 0.7763, "num_input_tokens_seen": 26990184, "step": 46795 }, { "epoch": 6.970509383378016, "grad_norm": 0.16685239970684052, "learning_rate": 1.2724665906024025e-05, "loss": 0.8238, "num_input_tokens_seen": 26992872, "step": 46800 }, { "epoch": 6.971254095918975, "grad_norm": 0.20118214190006256, "learning_rate": 1.2719004840588106e-05, "loss": 0.8339, "num_input_tokens_seen": 26995624, "step": 46805 }, { "epoch": 6.971998808459935, "grad_norm": 0.19470590353012085, "learning_rate": 1.2713344605051905e-05, "loss": 0.8497, "num_input_tokens_seen": 26998760, "step": 46810 }, { "epoch": 6.9727435210008935, "grad_norm": 0.19980554282665253, "learning_rate": 1.2707685199797926e-05, "loss": 0.7898, "num_input_tokens_seen": 27001448, "step": 46815 }, { "epoch": 6.973488233541853, "grad_norm": 0.24842248857021332, "learning_rate": 1.270202662520861e-05, "loss": 0.8249, "num_input_tokens_seen": 27004264, "step": 46820 }, { "epoch": 6.974232946082812, "grad_norm": 0.23133619129657745, "learning_rate": 1.2696368881666325e-05, "loss": 0.8093, "num_input_tokens_seen": 27007176, "step": 46825 }, { "epoch": 6.9749776586237715, "grad_norm": 0.1980174481868744, "learning_rate": 1.2690711969553412e-05, "loss": 0.8355, "num_input_tokens_seen": 27010088, "step": 46830 }, { "epoch": 6.97572237116473, "grad_norm": 0.21978697180747986, "learning_rate": 1.2685055889252146e-05, "loss": 0.8091, "num_input_tokens_seen": 27012776, "step": 46835 }, { "epoch": 6.97646708370569, "grad_norm": 0.16147273778915405, "learning_rate": 1.2679400641144723e-05, "loss": 0.7824, "num_input_tokens_seen": 27015688, "step": 46840 }, { "epoch": 6.977211796246649, "grad_norm": 0.20513278245925903, "learning_rate": 1.2673746225613315e-05, "loss": 0.7923, "num_input_tokens_seen": 27018568, "step": 46845 }, { "epoch": 6.977956508787608, "grad_norm": 0.2097245454788208, "learning_rate": 1.2668092643040018e-05, "loss": 0.8049, "num_input_tokens_seen": 27021288, "step": 46850 }, { "epoch": 6.978701221328567, "grad_norm": 0.31384506821632385, "learning_rate": 1.2662439893806899e-05, "loss": 0.8504, "num_input_tokens_seen": 27024200, "step": 46855 }, { "epoch": 6.979445933869527, "grad_norm": 0.17347560822963715, "learning_rate": 1.2656787978295913e-05, "loss": 0.8141, "num_input_tokens_seen": 27027048, "step": 46860 }, { "epoch": 6.9801906464104855, "grad_norm": 0.25726819038391113, "learning_rate": 1.265113689688902e-05, "loss": 0.7934, "num_input_tokens_seen": 27029896, "step": 46865 }, { "epoch": 6.980935358951445, "grad_norm": 0.1740093231201172, "learning_rate": 1.2645486649968085e-05, "loss": 0.7798, "num_input_tokens_seen": 27032616, "step": 46870 }, { "epoch": 6.981680071492404, "grad_norm": 0.21502599120140076, "learning_rate": 1.2639837237914943e-05, "loss": 0.8122, "num_input_tokens_seen": 27035304, "step": 46875 }, { "epoch": 6.982424784033363, "grad_norm": 0.20291605591773987, "learning_rate": 1.2634188661111335e-05, "loss": 0.7759, "num_input_tokens_seen": 27038184, "step": 46880 }, { "epoch": 6.983169496574322, "grad_norm": 0.20110447704792023, "learning_rate": 1.2628540919938991e-05, "loss": 0.8157, "num_input_tokens_seen": 27040904, "step": 46885 }, { "epoch": 6.983914209115282, "grad_norm": 0.26006239652633667, "learning_rate": 1.2622894014779547e-05, "loss": 0.8047, "num_input_tokens_seen": 27043720, "step": 46890 }, { "epoch": 6.984658921656241, "grad_norm": 0.17234471440315247, "learning_rate": 1.2617247946014604e-05, "loss": 0.7985, "num_input_tokens_seen": 27046984, "step": 46895 }, { "epoch": 6.9854036341971995, "grad_norm": 0.1933818757534027, "learning_rate": 1.2611602714025696e-05, "loss": 0.8011, "num_input_tokens_seen": 27049800, "step": 46900 }, { "epoch": 6.986148346738159, "grad_norm": 0.18317867815494537, "learning_rate": 1.2605958319194311e-05, "loss": 0.7832, "num_input_tokens_seen": 27053064, "step": 46905 }, { "epoch": 6.986893059279119, "grad_norm": 0.16515859961509705, "learning_rate": 1.2600314761901874e-05, "loss": 0.8265, "num_input_tokens_seen": 27055816, "step": 46910 }, { "epoch": 6.9876377718200775, "grad_norm": 0.22241553664207458, "learning_rate": 1.259467204252976e-05, "loss": 0.7796, "num_input_tokens_seen": 27058632, "step": 46915 }, { "epoch": 6.988382484361036, "grad_norm": 0.12541136145591736, "learning_rate": 1.2589030161459275e-05, "loss": 0.8312, "num_input_tokens_seen": 27061480, "step": 46920 }, { "epoch": 6.989127196901996, "grad_norm": 0.25527533888816833, "learning_rate": 1.2583389119071659e-05, "loss": 0.8113, "num_input_tokens_seen": 27064328, "step": 46925 }, { "epoch": 6.989871909442955, "grad_norm": 0.20390719175338745, "learning_rate": 1.2577748915748127e-05, "loss": 0.8195, "num_input_tokens_seen": 27067336, "step": 46930 }, { "epoch": 6.990616621983914, "grad_norm": 0.18799428641796112, "learning_rate": 1.2572109551869815e-05, "loss": 0.8071, "num_input_tokens_seen": 27070120, "step": 46935 }, { "epoch": 6.991361334524873, "grad_norm": 0.2527753710746765, "learning_rate": 1.2566471027817817e-05, "loss": 0.785, "num_input_tokens_seen": 27073064, "step": 46940 }, { "epoch": 6.992106047065833, "grad_norm": 0.2108406275510788, "learning_rate": 1.256083334397315e-05, "loss": 0.776, "num_input_tokens_seen": 27075784, "step": 46945 }, { "epoch": 6.9928507596067915, "grad_norm": 0.15564881265163422, "learning_rate": 1.2555196500716803e-05, "loss": 0.7823, "num_input_tokens_seen": 27078856, "step": 46950 }, { "epoch": 6.993595472147751, "grad_norm": 0.23227746784687042, "learning_rate": 1.2549560498429683e-05, "loss": 0.8015, "num_input_tokens_seen": 27081992, "step": 46955 }, { "epoch": 6.99434018468871, "grad_norm": 0.2391137033700943, "learning_rate": 1.2543925337492631e-05, "loss": 0.805, "num_input_tokens_seen": 27084840, "step": 46960 }, { "epoch": 6.9950848972296695, "grad_norm": 0.1959073543548584, "learning_rate": 1.2538291018286462e-05, "loss": 0.778, "num_input_tokens_seen": 27087688, "step": 46965 }, { "epoch": 6.995829609770628, "grad_norm": 0.17796280980110168, "learning_rate": 1.2532657541191922e-05, "loss": 0.7746, "num_input_tokens_seen": 27090760, "step": 46970 }, { "epoch": 6.996574322311588, "grad_norm": 0.16761696338653564, "learning_rate": 1.2527024906589698e-05, "loss": 0.7642, "num_input_tokens_seen": 27093640, "step": 46975 }, { "epoch": 6.997319034852547, "grad_norm": 0.18474900722503662, "learning_rate": 1.252139311486042e-05, "loss": 0.8324, "num_input_tokens_seen": 27096840, "step": 46980 }, { "epoch": 6.998063747393506, "grad_norm": 0.15242096781730652, "learning_rate": 1.2515762166384668e-05, "loss": 0.823, "num_input_tokens_seen": 27099976, "step": 46985 }, { "epoch": 6.998808459934465, "grad_norm": 0.3560231328010559, "learning_rate": 1.2510132061542939e-05, "loss": 0.8086, "num_input_tokens_seen": 27103112, "step": 46990 }, { "epoch": 6.999553172475425, "grad_norm": 0.24083895981311798, "learning_rate": 1.2504502800715723e-05, "loss": 0.8272, "num_input_tokens_seen": 27106056, "step": 46995 }, { "epoch": 7.0, "eval_loss": 0.8034152388572693, "eval_runtime": 45.3109, "eval_samples_per_second": 65.856, "eval_steps_per_second": 16.464, "num_input_tokens_seen": 27107328, "step": 46998 }, { "epoch": 7.0002978850163835, "grad_norm": 0.18052858114242554, "learning_rate": 1.2498874384283389e-05, "loss": 0.8305, "num_input_tokens_seen": 27108544, "step": 47000 }, { "epoch": 7.001042597557343, "grad_norm": 0.30253443121910095, "learning_rate": 1.24932468126263e-05, "loss": 0.8223, "num_input_tokens_seen": 27111360, "step": 47005 }, { "epoch": 7.001787310098302, "grad_norm": 0.22628748416900635, "learning_rate": 1.248762008612474e-05, "loss": 0.8279, "num_input_tokens_seen": 27114080, "step": 47010 }, { "epoch": 7.0025320226392616, "grad_norm": 0.2673136293888092, "learning_rate": 1.2481994205158948e-05, "loss": 0.8045, "num_input_tokens_seen": 27117216, "step": 47015 }, { "epoch": 7.00327673518022, "grad_norm": 0.20040330290794373, "learning_rate": 1.2476369170109098e-05, "loss": 0.8075, "num_input_tokens_seen": 27119840, "step": 47020 }, { "epoch": 7.00402144772118, "grad_norm": 0.33653730154037476, "learning_rate": 1.2470744981355296e-05, "loss": 0.8088, "num_input_tokens_seen": 27122528, "step": 47025 }, { "epoch": 7.004766160262139, "grad_norm": 0.2032952755689621, "learning_rate": 1.2465121639277605e-05, "loss": 0.8061, "num_input_tokens_seen": 27125472, "step": 47030 }, { "epoch": 7.005510872803098, "grad_norm": 0.20927315950393677, "learning_rate": 1.2459499144256042e-05, "loss": 0.7683, "num_input_tokens_seen": 27128704, "step": 47035 }, { "epoch": 7.006255585344057, "grad_norm": 0.19854426383972168, "learning_rate": 1.2453877496670532e-05, "loss": 0.8038, "num_input_tokens_seen": 27131712, "step": 47040 }, { "epoch": 7.007000297885017, "grad_norm": 0.3067030608654022, "learning_rate": 1.244825669690097e-05, "loss": 0.7995, "num_input_tokens_seen": 27134688, "step": 47045 }, { "epoch": 7.0077450104259755, "grad_norm": 0.23130470514297485, "learning_rate": 1.24426367453272e-05, "loss": 0.8076, "num_input_tokens_seen": 27137632, "step": 47050 }, { "epoch": 7.008489722966935, "grad_norm": 0.17667220532894135, "learning_rate": 1.2437017642328971e-05, "loss": 0.8294, "num_input_tokens_seen": 27140448, "step": 47055 }, { "epoch": 7.009234435507894, "grad_norm": 0.26449841260910034, "learning_rate": 1.2431399388286017e-05, "loss": 0.7991, "num_input_tokens_seen": 27143328, "step": 47060 }, { "epoch": 7.009979148048854, "grad_norm": 0.19384345412254333, "learning_rate": 1.242578198357799e-05, "loss": 0.789, "num_input_tokens_seen": 27146144, "step": 47065 }, { "epoch": 7.010723860589812, "grad_norm": 0.20868554711341858, "learning_rate": 1.2420165428584493e-05, "loss": 0.8204, "num_input_tokens_seen": 27148864, "step": 47070 }, { "epoch": 7.011468573130771, "grad_norm": 0.3024117946624756, "learning_rate": 1.2414549723685082e-05, "loss": 0.8426, "num_input_tokens_seen": 27152288, "step": 47075 }, { "epoch": 7.012213285671731, "grad_norm": 0.2824068069458008, "learning_rate": 1.240893486925922e-05, "loss": 0.8128, "num_input_tokens_seen": 27155264, "step": 47080 }, { "epoch": 7.0129579982126895, "grad_norm": 0.13355900347232819, "learning_rate": 1.2403320865686361e-05, "loss": 0.8143, "num_input_tokens_seen": 27157920, "step": 47085 }, { "epoch": 7.013702710753649, "grad_norm": 0.24590575695037842, "learning_rate": 1.239770771334585e-05, "loss": 0.8151, "num_input_tokens_seen": 27161216, "step": 47090 }, { "epoch": 7.014447423294608, "grad_norm": 0.34092235565185547, "learning_rate": 1.2392095412617017e-05, "loss": 0.8299, "num_input_tokens_seen": 27164256, "step": 47095 }, { "epoch": 7.0151921358355676, "grad_norm": 0.18189142644405365, "learning_rate": 1.2386483963879114e-05, "loss": 0.7732, "num_input_tokens_seen": 27167168, "step": 47100 }, { "epoch": 7.015936848376526, "grad_norm": 0.163546621799469, "learning_rate": 1.2380873367511344e-05, "loss": 0.8054, "num_input_tokens_seen": 27169824, "step": 47105 }, { "epoch": 7.016681560917486, "grad_norm": 0.14269429445266724, "learning_rate": 1.2375263623892846e-05, "loss": 0.7745, "num_input_tokens_seen": 27172544, "step": 47110 }, { "epoch": 7.017426273458445, "grad_norm": 0.2938309907913208, "learning_rate": 1.2369654733402714e-05, "loss": 0.7941, "num_input_tokens_seen": 27175712, "step": 47115 }, { "epoch": 7.018170985999404, "grad_norm": 0.23401351273059845, "learning_rate": 1.2364046696419962e-05, "loss": 0.7964, "num_input_tokens_seen": 27178368, "step": 47120 }, { "epoch": 7.018915698540363, "grad_norm": 0.15378007292747498, "learning_rate": 1.235843951332355e-05, "loss": 0.7976, "num_input_tokens_seen": 27181120, "step": 47125 }, { "epoch": 7.019660411081323, "grad_norm": 0.22997158765792847, "learning_rate": 1.2352833184492402e-05, "loss": 0.7878, "num_input_tokens_seen": 27183744, "step": 47130 }, { "epoch": 7.0204051236222815, "grad_norm": 0.2477448731660843, "learning_rate": 1.2347227710305365e-05, "loss": 0.8067, "num_input_tokens_seen": 27186784, "step": 47135 }, { "epoch": 7.021149836163241, "grad_norm": 0.16251689195632935, "learning_rate": 1.2341623091141238e-05, "loss": 0.7942, "num_input_tokens_seen": 27189600, "step": 47140 }, { "epoch": 7.0218945487042, "grad_norm": 0.1849970817565918, "learning_rate": 1.2336019327378756e-05, "loss": 0.7844, "num_input_tokens_seen": 27192480, "step": 47145 }, { "epoch": 7.02263926124516, "grad_norm": 0.16879326105117798, "learning_rate": 1.2330416419396612e-05, "loss": 0.7927, "num_input_tokens_seen": 27195296, "step": 47150 }, { "epoch": 7.023383973786118, "grad_norm": 0.255049467086792, "learning_rate": 1.2324814367573411e-05, "loss": 0.7661, "num_input_tokens_seen": 27198592, "step": 47155 }, { "epoch": 7.024128686327078, "grad_norm": 0.25264662504196167, "learning_rate": 1.2319213172287716e-05, "loss": 0.8169, "num_input_tokens_seen": 27201824, "step": 47160 }, { "epoch": 7.024873398868037, "grad_norm": 0.16451279819011688, "learning_rate": 1.231361283391804e-05, "loss": 0.8117, "num_input_tokens_seen": 27204576, "step": 47165 }, { "epoch": 7.025618111408996, "grad_norm": 0.2300158143043518, "learning_rate": 1.2308013352842826e-05, "loss": 0.797, "num_input_tokens_seen": 27207488, "step": 47170 }, { "epoch": 7.026362823949955, "grad_norm": 0.15586310625076294, "learning_rate": 1.230241472944047e-05, "loss": 0.7916, "num_input_tokens_seen": 27210336, "step": 47175 }, { "epoch": 7.027107536490915, "grad_norm": 0.14862754940986633, "learning_rate": 1.22968169640893e-05, "loss": 0.8273, "num_input_tokens_seen": 27213536, "step": 47180 }, { "epoch": 7.0278522490318736, "grad_norm": 0.29012787342071533, "learning_rate": 1.2291220057167602e-05, "loss": 0.791, "num_input_tokens_seen": 27216288, "step": 47185 }, { "epoch": 7.028596961572833, "grad_norm": 0.3126191198825836, "learning_rate": 1.2285624009053573e-05, "loss": 0.7982, "num_input_tokens_seen": 27219072, "step": 47190 }, { "epoch": 7.029341674113792, "grad_norm": 0.15115384757518768, "learning_rate": 1.2280028820125391e-05, "loss": 0.798, "num_input_tokens_seen": 27222016, "step": 47195 }, { "epoch": 7.030086386654752, "grad_norm": 0.23821119964122772, "learning_rate": 1.2274434490761135e-05, "loss": 0.7921, "num_input_tokens_seen": 27224960, "step": 47200 }, { "epoch": 7.03083109919571, "grad_norm": 0.19254061579704285, "learning_rate": 1.226884102133886e-05, "loss": 0.7594, "num_input_tokens_seen": 27228000, "step": 47205 }, { "epoch": 7.03157581173667, "grad_norm": 0.27097588777542114, "learning_rate": 1.2263248412236547e-05, "loss": 0.7944, "num_input_tokens_seen": 27230688, "step": 47210 }, { "epoch": 7.032320524277629, "grad_norm": 0.20360951125621796, "learning_rate": 1.2257656663832129e-05, "loss": 0.797, "num_input_tokens_seen": 27233152, "step": 47215 }, { "epoch": 7.033065236818588, "grad_norm": 0.228648841381073, "learning_rate": 1.225206577650346e-05, "loss": 0.7643, "num_input_tokens_seen": 27236576, "step": 47220 }, { "epoch": 7.033809949359547, "grad_norm": 0.22481216490268707, "learning_rate": 1.2246475750628355e-05, "loss": 0.8103, "num_input_tokens_seen": 27239232, "step": 47225 }, { "epoch": 7.034554661900507, "grad_norm": 0.33280062675476074, "learning_rate": 1.2240886586584568e-05, "loss": 0.8311, "num_input_tokens_seen": 27242048, "step": 47230 }, { "epoch": 7.035299374441466, "grad_norm": 0.23253463208675385, "learning_rate": 1.2235298284749803e-05, "loss": 0.8114, "num_input_tokens_seen": 27244800, "step": 47235 }, { "epoch": 7.036044086982425, "grad_norm": 0.21770383417606354, "learning_rate": 1.2229710845501669e-05, "loss": 0.805, "num_input_tokens_seen": 27247584, "step": 47240 }, { "epoch": 7.036788799523384, "grad_norm": 0.20584622025489807, "learning_rate": 1.2224124269217756e-05, "loss": 0.7892, "num_input_tokens_seen": 27250464, "step": 47245 }, { "epoch": 7.037533512064343, "grad_norm": 0.20911137759685516, "learning_rate": 1.2218538556275594e-05, "loss": 0.786, "num_input_tokens_seen": 27253216, "step": 47250 }, { "epoch": 7.038278224605302, "grad_norm": 0.2141890823841095, "learning_rate": 1.2212953707052619e-05, "loss": 0.8088, "num_input_tokens_seen": 27256064, "step": 47255 }, { "epoch": 7.039022937146261, "grad_norm": 0.1958814561367035, "learning_rate": 1.2207369721926243e-05, "loss": 0.7728, "num_input_tokens_seen": 27259040, "step": 47260 }, { "epoch": 7.039767649687221, "grad_norm": 0.1698477566242218, "learning_rate": 1.2201786601273813e-05, "loss": 0.7885, "num_input_tokens_seen": 27261888, "step": 47265 }, { "epoch": 7.0405123622281796, "grad_norm": 0.1970488727092743, "learning_rate": 1.2196204345472609e-05, "loss": 0.7928, "num_input_tokens_seen": 27264544, "step": 47270 }, { "epoch": 7.041257074769139, "grad_norm": 0.35972529649734497, "learning_rate": 1.2190622954899869e-05, "loss": 0.8255, "num_input_tokens_seen": 27267392, "step": 47275 }, { "epoch": 7.042001787310098, "grad_norm": 0.18772415816783905, "learning_rate": 1.218504242993274e-05, "loss": 0.7886, "num_input_tokens_seen": 27270112, "step": 47280 }, { "epoch": 7.042746499851058, "grad_norm": 0.18066851794719696, "learning_rate": 1.2179462770948355e-05, "loss": 0.8201, "num_input_tokens_seen": 27272800, "step": 47285 }, { "epoch": 7.043491212392016, "grad_norm": 0.17232868075370789, "learning_rate": 1.2173883978323739e-05, "loss": 0.8115, "num_input_tokens_seen": 27275424, "step": 47290 }, { "epoch": 7.044235924932976, "grad_norm": 0.1944294571876526, "learning_rate": 1.2168306052435896e-05, "loss": 0.7838, "num_input_tokens_seen": 27278592, "step": 47295 }, { "epoch": 7.044980637473935, "grad_norm": 0.2294062077999115, "learning_rate": 1.216272899366176e-05, "loss": 0.7929, "num_input_tokens_seen": 27281984, "step": 47300 }, { "epoch": 7.045725350014894, "grad_norm": 0.24341510236263275, "learning_rate": 1.2157152802378207e-05, "loss": 0.8211, "num_input_tokens_seen": 27284832, "step": 47305 }, { "epoch": 7.046470062555853, "grad_norm": 0.2519148588180542, "learning_rate": 1.2151577478962054e-05, "loss": 0.7995, "num_input_tokens_seen": 27287808, "step": 47310 }, { "epoch": 7.047214775096813, "grad_norm": 0.18818451464176178, "learning_rate": 1.2146003023790064e-05, "loss": 0.83, "num_input_tokens_seen": 27290592, "step": 47315 }, { "epoch": 7.047959487637772, "grad_norm": 0.2026139795780182, "learning_rate": 1.2140429437238932e-05, "loss": 0.8044, "num_input_tokens_seen": 27293248, "step": 47320 }, { "epoch": 7.048704200178731, "grad_norm": 0.19541379809379578, "learning_rate": 1.213485671968528e-05, "loss": 0.7765, "num_input_tokens_seen": 27296512, "step": 47325 }, { "epoch": 7.04944891271969, "grad_norm": 0.26434624195098877, "learning_rate": 1.2129284871505712e-05, "loss": 0.7879, "num_input_tokens_seen": 27299360, "step": 47330 }, { "epoch": 7.05019362526065, "grad_norm": 0.21006932854652405, "learning_rate": 1.2123713893076741e-05, "loss": 0.7811, "num_input_tokens_seen": 27302336, "step": 47335 }, { "epoch": 7.050938337801608, "grad_norm": 0.1870446801185608, "learning_rate": 1.2118143784774832e-05, "loss": 0.8313, "num_input_tokens_seen": 27305344, "step": 47340 }, { "epoch": 7.051683050342568, "grad_norm": 0.17890840768814087, "learning_rate": 1.2112574546976397e-05, "loss": 0.7992, "num_input_tokens_seen": 27308128, "step": 47345 }, { "epoch": 7.052427762883527, "grad_norm": 0.26138773560523987, "learning_rate": 1.210700618005778e-05, "loss": 0.8142, "num_input_tokens_seen": 27310784, "step": 47350 }, { "epoch": 7.053172475424486, "grad_norm": 0.27364370226860046, "learning_rate": 1.2101438684395264e-05, "loss": 0.7847, "num_input_tokens_seen": 27313536, "step": 47355 }, { "epoch": 7.053917187965445, "grad_norm": 0.15699800848960876, "learning_rate": 1.2095872060365084e-05, "loss": 0.7996, "num_input_tokens_seen": 27316416, "step": 47360 }, { "epoch": 7.054661900506405, "grad_norm": 0.16732285916805267, "learning_rate": 1.20903063083434e-05, "loss": 0.8127, "num_input_tokens_seen": 27319264, "step": 47365 }, { "epoch": 7.055406613047364, "grad_norm": 0.2497609257698059, "learning_rate": 1.2084741428706328e-05, "loss": 0.7819, "num_input_tokens_seen": 27322080, "step": 47370 }, { "epoch": 7.056151325588323, "grad_norm": 0.23193195462226868, "learning_rate": 1.207917742182992e-05, "loss": 0.7596, "num_input_tokens_seen": 27325056, "step": 47375 }, { "epoch": 7.056896038129282, "grad_norm": 0.18564002215862274, "learning_rate": 1.2073614288090179e-05, "loss": 0.8181, "num_input_tokens_seen": 27327840, "step": 47380 }, { "epoch": 7.057640750670242, "grad_norm": 0.2010033130645752, "learning_rate": 1.206805202786302e-05, "loss": 0.8181, "num_input_tokens_seen": 27330720, "step": 47385 }, { "epoch": 7.0583854632112, "grad_norm": 0.2920725643634796, "learning_rate": 1.2062490641524327e-05, "loss": 0.7851, "num_input_tokens_seen": 27333536, "step": 47390 }, { "epoch": 7.05913017575216, "grad_norm": 0.18170788884162903, "learning_rate": 1.2056930129449918e-05, "loss": 0.8002, "num_input_tokens_seen": 27336416, "step": 47395 }, { "epoch": 7.059874888293119, "grad_norm": 0.2248106747865677, "learning_rate": 1.2051370492015556e-05, "loss": 0.8344, "num_input_tokens_seen": 27339040, "step": 47400 }, { "epoch": 7.0606196008340785, "grad_norm": 0.3030628561973572, "learning_rate": 1.2045811729596922e-05, "loss": 0.8061, "num_input_tokens_seen": 27342016, "step": 47405 }, { "epoch": 7.061364313375037, "grad_norm": 0.22293859720230103, "learning_rate": 1.2040253842569665e-05, "loss": 0.7829, "num_input_tokens_seen": 27344768, "step": 47410 }, { "epoch": 7.062109025915996, "grad_norm": 0.19021281599998474, "learning_rate": 1.2034696831309369e-05, "loss": 0.7854, "num_input_tokens_seen": 27347776, "step": 47415 }, { "epoch": 7.062853738456956, "grad_norm": 0.17765626311302185, "learning_rate": 1.202914069619154e-05, "loss": 0.8015, "num_input_tokens_seen": 27350752, "step": 47420 }, { "epoch": 7.063598450997914, "grad_norm": 0.21818208694458008, "learning_rate": 1.202358543759165e-05, "loss": 0.8326, "num_input_tokens_seen": 27353504, "step": 47425 }, { "epoch": 7.064343163538874, "grad_norm": 0.21144899725914001, "learning_rate": 1.2018031055885093e-05, "loss": 0.7953, "num_input_tokens_seen": 27356448, "step": 47430 }, { "epoch": 7.065087876079833, "grad_norm": 0.17802952229976654, "learning_rate": 1.2012477551447222e-05, "loss": 0.7739, "num_input_tokens_seen": 27359040, "step": 47435 }, { "epoch": 7.065832588620792, "grad_norm": 0.35456734895706177, "learning_rate": 1.2006924924653318e-05, "loss": 0.7995, "num_input_tokens_seen": 27361824, "step": 47440 }, { "epoch": 7.066577301161751, "grad_norm": 0.3097161650657654, "learning_rate": 1.2001373175878597e-05, "loss": 0.8131, "num_input_tokens_seen": 27364960, "step": 47445 }, { "epoch": 7.067322013702711, "grad_norm": 0.2597571313381195, "learning_rate": 1.1995822305498233e-05, "loss": 0.8325, "num_input_tokens_seen": 27367840, "step": 47450 }, { "epoch": 7.06806672624367, "grad_norm": 0.27311038970947266, "learning_rate": 1.1990272313887321e-05, "loss": 0.7985, "num_input_tokens_seen": 27370368, "step": 47455 }, { "epoch": 7.068811438784629, "grad_norm": 0.29281526803970337, "learning_rate": 1.1984723201420911e-05, "loss": 0.816, "num_input_tokens_seen": 27373216, "step": 47460 }, { "epoch": 7.069556151325588, "grad_norm": 0.20575816929340363, "learning_rate": 1.1979174968473991e-05, "loss": 0.8066, "num_input_tokens_seen": 27376160, "step": 47465 }, { "epoch": 7.070300863866548, "grad_norm": 0.21023207902908325, "learning_rate": 1.1973627615421487e-05, "loss": 0.7995, "num_input_tokens_seen": 27379424, "step": 47470 }, { "epoch": 7.071045576407506, "grad_norm": 0.2139250785112381, "learning_rate": 1.1968081142638268e-05, "loss": 0.7839, "num_input_tokens_seen": 27382240, "step": 47475 }, { "epoch": 7.071790288948466, "grad_norm": 0.1669602245092392, "learning_rate": 1.1962535550499152e-05, "loss": 0.7762, "num_input_tokens_seen": 27385312, "step": 47480 }, { "epoch": 7.072535001489425, "grad_norm": 0.1827901303768158, "learning_rate": 1.1956990839378877e-05, "loss": 0.7859, "num_input_tokens_seen": 27388064, "step": 47485 }, { "epoch": 7.0732797140303845, "grad_norm": 0.29710036516189575, "learning_rate": 1.1951447009652119e-05, "loss": 0.7707, "num_input_tokens_seen": 27391136, "step": 47490 }, { "epoch": 7.074024426571343, "grad_norm": 0.2745151221752167, "learning_rate": 1.1945904061693524e-05, "loss": 0.7923, "num_input_tokens_seen": 27393824, "step": 47495 }, { "epoch": 7.074769139112303, "grad_norm": 0.2418980747461319, "learning_rate": 1.1940361995877658e-05, "loss": 0.7919, "num_input_tokens_seen": 27396704, "step": 47500 }, { "epoch": 7.075513851653262, "grad_norm": 0.24351118505001068, "learning_rate": 1.1934820812579031e-05, "loss": 0.7948, "num_input_tokens_seen": 27399616, "step": 47505 }, { "epoch": 7.076258564194221, "grad_norm": 0.22364085912704468, "learning_rate": 1.1929280512172095e-05, "loss": 0.8112, "num_input_tokens_seen": 27402496, "step": 47510 }, { "epoch": 7.07700327673518, "grad_norm": 0.24434544146060944, "learning_rate": 1.1923741095031248e-05, "loss": 0.8274, "num_input_tokens_seen": 27405280, "step": 47515 }, { "epoch": 7.07774798927614, "grad_norm": 0.27666527032852173, "learning_rate": 1.1918202561530813e-05, "loss": 0.8393, "num_input_tokens_seen": 27408320, "step": 47520 }, { "epoch": 7.078492701817098, "grad_norm": 0.1776026487350464, "learning_rate": 1.1912664912045057e-05, "loss": 0.7922, "num_input_tokens_seen": 27411360, "step": 47525 }, { "epoch": 7.079237414358058, "grad_norm": 0.15731684863567352, "learning_rate": 1.1907128146948193e-05, "loss": 0.8069, "num_input_tokens_seen": 27414336, "step": 47530 }, { "epoch": 7.079982126899017, "grad_norm": 0.2130649983882904, "learning_rate": 1.190159226661438e-05, "loss": 0.781, "num_input_tokens_seen": 27417088, "step": 47535 }, { "epoch": 7.0807268394399765, "grad_norm": 0.2301255613565445, "learning_rate": 1.1896057271417707e-05, "loss": 0.8194, "num_input_tokens_seen": 27419840, "step": 47540 }, { "epoch": 7.081471551980935, "grad_norm": 0.17207488417625427, "learning_rate": 1.1890523161732214e-05, "loss": 0.7752, "num_input_tokens_seen": 27422752, "step": 47545 }, { "epoch": 7.082216264521895, "grad_norm": 0.25529879331588745, "learning_rate": 1.188498993793186e-05, "loss": 0.7711, "num_input_tokens_seen": 27425568, "step": 47550 }, { "epoch": 7.082960977062854, "grad_norm": 0.19194191694259644, "learning_rate": 1.187945760039056e-05, "loss": 0.7993, "num_input_tokens_seen": 27428672, "step": 47555 }, { "epoch": 7.083705689603813, "grad_norm": 0.20328663289546967, "learning_rate": 1.1873926149482183e-05, "loss": 0.8048, "num_input_tokens_seen": 27431232, "step": 47560 }, { "epoch": 7.084450402144772, "grad_norm": 0.20608524978160858, "learning_rate": 1.1868395585580503e-05, "loss": 0.7768, "num_input_tokens_seen": 27434144, "step": 47565 }, { "epoch": 7.085195114685732, "grad_norm": 0.3414706885814667, "learning_rate": 1.186286590905926e-05, "loss": 0.8267, "num_input_tokens_seen": 27437408, "step": 47570 }, { "epoch": 7.0859398272266905, "grad_norm": 0.16193868219852448, "learning_rate": 1.1857337120292123e-05, "loss": 0.8136, "num_input_tokens_seen": 27440256, "step": 47575 }, { "epoch": 7.08668453976765, "grad_norm": 0.2314511090517044, "learning_rate": 1.1851809219652721e-05, "loss": 0.7671, "num_input_tokens_seen": 27443008, "step": 47580 }, { "epoch": 7.087429252308609, "grad_norm": 0.21499329805374146, "learning_rate": 1.1846282207514586e-05, "loss": 0.7853, "num_input_tokens_seen": 27445952, "step": 47585 }, { "epoch": 7.088173964849568, "grad_norm": 0.203821063041687, "learning_rate": 1.184075608425122e-05, "loss": 0.7681, "num_input_tokens_seen": 27448864, "step": 47590 }, { "epoch": 7.088918677390527, "grad_norm": 0.21115495264530182, "learning_rate": 1.1835230850236057e-05, "loss": 0.8044, "num_input_tokens_seen": 27451648, "step": 47595 }, { "epoch": 7.089663389931486, "grad_norm": 0.319888710975647, "learning_rate": 1.1829706505842478e-05, "loss": 0.8012, "num_input_tokens_seen": 27455072, "step": 47600 }, { "epoch": 7.090408102472446, "grad_norm": 0.2927168011665344, "learning_rate": 1.1824183051443776e-05, "loss": 0.7964, "num_input_tokens_seen": 27457760, "step": 47605 }, { "epoch": 7.091152815013404, "grad_norm": 0.18559491634368896, "learning_rate": 1.1818660487413217e-05, "loss": 0.7639, "num_input_tokens_seen": 27460480, "step": 47610 }, { "epoch": 7.091897527554364, "grad_norm": 0.15145640075206757, "learning_rate": 1.1813138814123997e-05, "loss": 0.7964, "num_input_tokens_seen": 27463584, "step": 47615 }, { "epoch": 7.092642240095323, "grad_norm": 0.20034267008304596, "learning_rate": 1.1807618031949235e-05, "loss": 0.8178, "num_input_tokens_seen": 27466528, "step": 47620 }, { "epoch": 7.0933869526362825, "grad_norm": 0.2452366203069687, "learning_rate": 1.1802098141262008e-05, "loss": 0.7542, "num_input_tokens_seen": 27469280, "step": 47625 }, { "epoch": 7.094131665177241, "grad_norm": 0.27792835235595703, "learning_rate": 1.1796579142435332e-05, "loss": 0.7787, "num_input_tokens_seen": 27472192, "step": 47630 }, { "epoch": 7.094876377718201, "grad_norm": 0.19903382658958435, "learning_rate": 1.1791061035842158e-05, "loss": 0.8384, "num_input_tokens_seen": 27475136, "step": 47635 }, { "epoch": 7.09562109025916, "grad_norm": 0.23942133784294128, "learning_rate": 1.178554382185538e-05, "loss": 0.8319, "num_input_tokens_seen": 27477984, "step": 47640 }, { "epoch": 7.096365802800119, "grad_norm": 0.16007038950920105, "learning_rate": 1.1780027500847818e-05, "loss": 0.756, "num_input_tokens_seen": 27480864, "step": 47645 }, { "epoch": 7.097110515341078, "grad_norm": 0.212952122092247, "learning_rate": 1.177451207319226e-05, "loss": 0.797, "num_input_tokens_seen": 27483744, "step": 47650 }, { "epoch": 7.097855227882038, "grad_norm": 0.18946515023708344, "learning_rate": 1.1768997539261392e-05, "loss": 0.7928, "num_input_tokens_seen": 27486464, "step": 47655 }, { "epoch": 7.0985999404229965, "grad_norm": 0.1933276653289795, "learning_rate": 1.176348389942788e-05, "loss": 0.7941, "num_input_tokens_seen": 27489408, "step": 47660 }, { "epoch": 7.099344652963956, "grad_norm": 0.20337195694446564, "learning_rate": 1.175797115406431e-05, "loss": 0.7876, "num_input_tokens_seen": 27492160, "step": 47665 }, { "epoch": 7.100089365504915, "grad_norm": 0.20357684791088104, "learning_rate": 1.1752459303543209e-05, "loss": 0.8151, "num_input_tokens_seen": 27495296, "step": 47670 }, { "epoch": 7.1008340780458745, "grad_norm": 0.23979899287223816, "learning_rate": 1.174694834823705e-05, "loss": 0.8066, "num_input_tokens_seen": 27498176, "step": 47675 }, { "epoch": 7.101578790586833, "grad_norm": 0.32146745920181274, "learning_rate": 1.1741438288518248e-05, "loss": 0.8324, "num_input_tokens_seen": 27500864, "step": 47680 }, { "epoch": 7.102323503127793, "grad_norm": 0.28807154297828674, "learning_rate": 1.173592912475914e-05, "loss": 0.7751, "num_input_tokens_seen": 27504128, "step": 47685 }, { "epoch": 7.103068215668752, "grad_norm": 0.20279249548912048, "learning_rate": 1.1730420857332002e-05, "loss": 0.8018, "num_input_tokens_seen": 27506912, "step": 47690 }, { "epoch": 7.103812928209711, "grad_norm": 0.2368771880865097, "learning_rate": 1.1724913486609077e-05, "loss": 0.8195, "num_input_tokens_seen": 27509376, "step": 47695 }, { "epoch": 7.10455764075067, "grad_norm": 0.2080375999212265, "learning_rate": 1.1719407012962524e-05, "loss": 0.783, "num_input_tokens_seen": 27512384, "step": 47700 }, { "epoch": 7.10530235329163, "grad_norm": 0.14309662580490112, "learning_rate": 1.1713901436764451e-05, "loss": 0.7811, "num_input_tokens_seen": 27515360, "step": 47705 }, { "epoch": 7.1060470658325885, "grad_norm": 0.2735162675380707, "learning_rate": 1.1708396758386911e-05, "loss": 0.7927, "num_input_tokens_seen": 27518208, "step": 47710 }, { "epoch": 7.106791778373548, "grad_norm": 0.24440784752368927, "learning_rate": 1.1702892978201868e-05, "loss": 0.8493, "num_input_tokens_seen": 27520864, "step": 47715 }, { "epoch": 7.107536490914507, "grad_norm": 0.25739774107933044, "learning_rate": 1.1697390096581265e-05, "loss": 0.8024, "num_input_tokens_seen": 27523744, "step": 47720 }, { "epoch": 7.1082812034554665, "grad_norm": 0.24702437222003937, "learning_rate": 1.1691888113896945e-05, "loss": 0.7986, "num_input_tokens_seen": 27526976, "step": 47725 }, { "epoch": 7.109025915996425, "grad_norm": 0.20460061728954315, "learning_rate": 1.1686387030520721e-05, "loss": 0.7797, "num_input_tokens_seen": 27529664, "step": 47730 }, { "epoch": 7.109770628537385, "grad_norm": 0.18463554978370667, "learning_rate": 1.168088684682433e-05, "loss": 0.7974, "num_input_tokens_seen": 27532544, "step": 47735 }, { "epoch": 7.110515341078344, "grad_norm": 0.19927620887756348, "learning_rate": 1.1675387563179455e-05, "loss": 0.8022, "num_input_tokens_seen": 27535360, "step": 47740 }, { "epoch": 7.111260053619303, "grad_norm": 0.16921353340148926, "learning_rate": 1.1669889179957725e-05, "loss": 0.78, "num_input_tokens_seen": 27538048, "step": 47745 }, { "epoch": 7.112004766160262, "grad_norm": 0.1868216097354889, "learning_rate": 1.1664391697530677e-05, "loss": 0.8207, "num_input_tokens_seen": 27541056, "step": 47750 }, { "epoch": 7.112749478701222, "grad_norm": 0.17684152722358704, "learning_rate": 1.1658895116269821e-05, "loss": 0.8135, "num_input_tokens_seen": 27544000, "step": 47755 }, { "epoch": 7.1134941912421805, "grad_norm": 0.15658219158649445, "learning_rate": 1.16533994365466e-05, "loss": 0.7901, "num_input_tokens_seen": 27546848, "step": 47760 }, { "epoch": 7.114238903783139, "grad_norm": 0.34218069911003113, "learning_rate": 1.1647904658732373e-05, "loss": 0.8412, "num_input_tokens_seen": 27549792, "step": 47765 }, { "epoch": 7.114983616324099, "grad_norm": 0.2433939278125763, "learning_rate": 1.1642410783198465e-05, "loss": 0.8067, "num_input_tokens_seen": 27552800, "step": 47770 }, { "epoch": 7.115728328865058, "grad_norm": 0.32805299758911133, "learning_rate": 1.1636917810316126e-05, "loss": 0.8137, "num_input_tokens_seen": 27555968, "step": 47775 }, { "epoch": 7.116473041406017, "grad_norm": 0.17177598178386688, "learning_rate": 1.1631425740456562e-05, "loss": 0.8036, "num_input_tokens_seen": 27558688, "step": 47780 }, { "epoch": 7.117217753946976, "grad_norm": 0.16109929978847504, "learning_rate": 1.1625934573990882e-05, "loss": 0.8171, "num_input_tokens_seen": 27561888, "step": 47785 }, { "epoch": 7.117962466487936, "grad_norm": 0.1942378282546997, "learning_rate": 1.1620444311290172e-05, "loss": 0.8081, "num_input_tokens_seen": 27564896, "step": 47790 }, { "epoch": 7.1187071790288945, "grad_norm": 0.2920604348182678, "learning_rate": 1.1614954952725434e-05, "loss": 0.8143, "num_input_tokens_seen": 27568000, "step": 47795 }, { "epoch": 7.119451891569854, "grad_norm": 0.26634857058525085, "learning_rate": 1.1609466498667634e-05, "loss": 0.7809, "num_input_tokens_seen": 27570816, "step": 47800 }, { "epoch": 7.120196604110813, "grad_norm": 0.25535258650779724, "learning_rate": 1.1603978949487634e-05, "loss": 0.7646, "num_input_tokens_seen": 27573664, "step": 47805 }, { "epoch": 7.1209413166517725, "grad_norm": 0.20915044844150543, "learning_rate": 1.1598492305556274e-05, "loss": 0.7874, "num_input_tokens_seen": 27576512, "step": 47810 }, { "epoch": 7.121686029192731, "grad_norm": 0.190914586186409, "learning_rate": 1.1593006567244328e-05, "loss": 0.806, "num_input_tokens_seen": 27579424, "step": 47815 }, { "epoch": 7.122430741733691, "grad_norm": 0.2275465726852417, "learning_rate": 1.1587521734922476e-05, "loss": 0.8251, "num_input_tokens_seen": 27582240, "step": 47820 }, { "epoch": 7.12317545427465, "grad_norm": 0.18696534633636475, "learning_rate": 1.1582037808961377e-05, "loss": 0.7924, "num_input_tokens_seen": 27585056, "step": 47825 }, { "epoch": 7.123920166815609, "grad_norm": 0.19051149487495422, "learning_rate": 1.1576554789731608e-05, "loss": 0.7786, "num_input_tokens_seen": 27587744, "step": 47830 }, { "epoch": 7.124664879356568, "grad_norm": 0.27752670645713806, "learning_rate": 1.1571072677603691e-05, "loss": 0.7898, "num_input_tokens_seen": 27590496, "step": 47835 }, { "epoch": 7.125409591897528, "grad_norm": 0.20296308398246765, "learning_rate": 1.1565591472948095e-05, "loss": 0.8017, "num_input_tokens_seen": 27593408, "step": 47840 }, { "epoch": 7.1261543044384865, "grad_norm": 0.18071380257606506, "learning_rate": 1.1560111176135197e-05, "loss": 0.7731, "num_input_tokens_seen": 27596224, "step": 47845 }, { "epoch": 7.126899016979446, "grad_norm": 0.21050286293029785, "learning_rate": 1.1554631787535353e-05, "loss": 0.7919, "num_input_tokens_seen": 27598976, "step": 47850 }, { "epoch": 7.127643729520405, "grad_norm": 0.23381413519382477, "learning_rate": 1.1549153307518817e-05, "loss": 0.8086, "num_input_tokens_seen": 27601920, "step": 47855 }, { "epoch": 7.128388442061365, "grad_norm": 0.2194284200668335, "learning_rate": 1.1543675736455814e-05, "loss": 0.799, "num_input_tokens_seen": 27604512, "step": 47860 }, { "epoch": 7.129133154602323, "grad_norm": 0.19407585263252258, "learning_rate": 1.1538199074716493e-05, "loss": 0.8521, "num_input_tokens_seen": 27607072, "step": 47865 }, { "epoch": 7.129877867143283, "grad_norm": 0.19163183867931366, "learning_rate": 1.1532723322670952e-05, "loss": 0.8155, "num_input_tokens_seen": 27609856, "step": 47870 }, { "epoch": 7.130622579684242, "grad_norm": 0.16890129446983337, "learning_rate": 1.152724848068922e-05, "loss": 0.7861, "num_input_tokens_seen": 27612640, "step": 47875 }, { "epoch": 7.131367292225201, "grad_norm": 0.2295607626438141, "learning_rate": 1.152177454914125e-05, "loss": 0.8302, "num_input_tokens_seen": 27615552, "step": 47880 }, { "epoch": 7.13211200476616, "grad_norm": 0.1557360142469406, "learning_rate": 1.151630152839697e-05, "loss": 0.7831, "num_input_tokens_seen": 27618560, "step": 47885 }, { "epoch": 7.13285671730712, "grad_norm": 0.22055955231189728, "learning_rate": 1.1510829418826199e-05, "loss": 0.8495, "num_input_tokens_seen": 27621600, "step": 47890 }, { "epoch": 7.1336014298480785, "grad_norm": 0.15231645107269287, "learning_rate": 1.1505358220798736e-05, "loss": 0.7909, "num_input_tokens_seen": 27624448, "step": 47895 }, { "epoch": 7.134346142389038, "grad_norm": 0.20096252858638763, "learning_rate": 1.1499887934684297e-05, "loss": 0.7956, "num_input_tokens_seen": 27627104, "step": 47900 }, { "epoch": 7.135090854929997, "grad_norm": 0.161427304148674, "learning_rate": 1.1494418560852546e-05, "loss": 0.816, "num_input_tokens_seen": 27629856, "step": 47905 }, { "epoch": 7.135835567470957, "grad_norm": 0.3752046227455139, "learning_rate": 1.1488950099673087e-05, "loss": 0.8356, "num_input_tokens_seen": 27632896, "step": 47910 }, { "epoch": 7.136580280011915, "grad_norm": 0.13798744976520538, "learning_rate": 1.148348255151544e-05, "loss": 0.7722, "num_input_tokens_seen": 27635488, "step": 47915 }, { "epoch": 7.137324992552875, "grad_norm": 0.20495209097862244, "learning_rate": 1.1478015916749089e-05, "loss": 0.8185, "num_input_tokens_seen": 27638400, "step": 47920 }, { "epoch": 7.138069705093834, "grad_norm": 0.18858718872070312, "learning_rate": 1.147255019574345e-05, "loss": 0.7942, "num_input_tokens_seen": 27641504, "step": 47925 }, { "epoch": 7.1388144176347925, "grad_norm": 0.1921473741531372, "learning_rate": 1.1467085388867866e-05, "loss": 0.7983, "num_input_tokens_seen": 27644480, "step": 47930 }, { "epoch": 7.139559130175752, "grad_norm": 0.24501585960388184, "learning_rate": 1.1461621496491628e-05, "loss": 0.7916, "num_input_tokens_seen": 27647552, "step": 47935 }, { "epoch": 7.140303842716711, "grad_norm": 0.24845241010189056, "learning_rate": 1.1456158518983967e-05, "loss": 0.8064, "num_input_tokens_seen": 27650464, "step": 47940 }, { "epoch": 7.141048555257671, "grad_norm": 0.24716250598430634, "learning_rate": 1.1450696456714057e-05, "loss": 0.8076, "num_input_tokens_seen": 27653376, "step": 47945 }, { "epoch": 7.141793267798629, "grad_norm": 0.20709574222564697, "learning_rate": 1.1445235310050987e-05, "loss": 0.8057, "num_input_tokens_seen": 27656288, "step": 47950 }, { "epoch": 7.142537980339589, "grad_norm": 0.19365793466567993, "learning_rate": 1.14397750793638e-05, "loss": 0.7828, "num_input_tokens_seen": 27658976, "step": 47955 }, { "epoch": 7.143282692880548, "grad_norm": 0.18086545169353485, "learning_rate": 1.1434315765021485e-05, "loss": 0.7656, "num_input_tokens_seen": 27661600, "step": 47960 }, { "epoch": 7.144027405421507, "grad_norm": 0.16913874447345734, "learning_rate": 1.1428857367392964e-05, "loss": 0.8123, "num_input_tokens_seen": 27664224, "step": 47965 }, { "epoch": 7.144772117962466, "grad_norm": 0.2414947897195816, "learning_rate": 1.1423399886847077e-05, "loss": 0.769, "num_input_tokens_seen": 27667040, "step": 47970 }, { "epoch": 7.145516830503426, "grad_norm": 0.20000068843364716, "learning_rate": 1.1417943323752629e-05, "loss": 0.8087, "num_input_tokens_seen": 27669952, "step": 47975 }, { "epoch": 7.1462615430443845, "grad_norm": 0.23980902135372162, "learning_rate": 1.1412487678478357e-05, "loss": 0.8078, "num_input_tokens_seen": 27672704, "step": 47980 }, { "epoch": 7.147006255585344, "grad_norm": 0.14023630321025848, "learning_rate": 1.1407032951392916e-05, "loss": 0.7562, "num_input_tokens_seen": 27675904, "step": 47985 }, { "epoch": 7.147750968126303, "grad_norm": 0.17804117500782013, "learning_rate": 1.1401579142864924e-05, "loss": 0.7839, "num_input_tokens_seen": 27679040, "step": 47990 }, { "epoch": 7.148495680667263, "grad_norm": 0.25838008522987366, "learning_rate": 1.1396126253262926e-05, "loss": 0.8081, "num_input_tokens_seen": 27682112, "step": 47995 }, { "epoch": 7.149240393208221, "grad_norm": 0.24422913789749146, "learning_rate": 1.1390674282955408e-05, "loss": 0.7778, "num_input_tokens_seen": 27684800, "step": 48000 }, { "epoch": 7.149985105749181, "grad_norm": 0.24598270654678345, "learning_rate": 1.1385223232310799e-05, "loss": 0.7756, "num_input_tokens_seen": 27687776, "step": 48005 }, { "epoch": 7.15072981829014, "grad_norm": 0.19913488626480103, "learning_rate": 1.1379773101697439e-05, "loss": 0.7948, "num_input_tokens_seen": 27690656, "step": 48010 }, { "epoch": 7.151474530831099, "grad_norm": 0.2762613892555237, "learning_rate": 1.1374323891483649e-05, "loss": 0.7966, "num_input_tokens_seen": 27693856, "step": 48015 }, { "epoch": 7.152219243372058, "grad_norm": 0.16959254443645477, "learning_rate": 1.136887560203764e-05, "loss": 0.7806, "num_input_tokens_seen": 27696832, "step": 48020 }, { "epoch": 7.152963955913018, "grad_norm": 0.2481464296579361, "learning_rate": 1.13634282337276e-05, "loss": 0.7818, "num_input_tokens_seen": 27699584, "step": 48025 }, { "epoch": 7.153708668453977, "grad_norm": 0.1802099496126175, "learning_rate": 1.1357981786921636e-05, "loss": 0.8005, "num_input_tokens_seen": 27702016, "step": 48030 }, { "epoch": 7.154453380994936, "grad_norm": 0.35848742723464966, "learning_rate": 1.13525362619878e-05, "loss": 0.8473, "num_input_tokens_seen": 27704736, "step": 48035 }, { "epoch": 7.155198093535895, "grad_norm": 0.28644490242004395, "learning_rate": 1.1347091659294087e-05, "loss": 0.831, "num_input_tokens_seen": 27707392, "step": 48040 }, { "epoch": 7.155942806076855, "grad_norm": 0.15442998707294464, "learning_rate": 1.13416479792084e-05, "loss": 0.8454, "num_input_tokens_seen": 27710048, "step": 48045 }, { "epoch": 7.156687518617813, "grad_norm": 0.18521562218666077, "learning_rate": 1.1336205222098622e-05, "loss": 0.8095, "num_input_tokens_seen": 27712992, "step": 48050 }, { "epoch": 7.157432231158773, "grad_norm": 0.2630288600921631, "learning_rate": 1.1330763388332533e-05, "loss": 0.8095, "num_input_tokens_seen": 27715808, "step": 48055 }, { "epoch": 7.158176943699732, "grad_norm": 0.2769261598587036, "learning_rate": 1.1325322478277877e-05, "loss": 0.7791, "num_input_tokens_seen": 27718528, "step": 48060 }, { "epoch": 7.158921656240691, "grad_norm": 0.24953635036945343, "learning_rate": 1.1319882492302333e-05, "loss": 0.7613, "num_input_tokens_seen": 27721568, "step": 48065 }, { "epoch": 7.15966636878165, "grad_norm": 0.24815243482589722, "learning_rate": 1.131444343077351e-05, "loss": 0.7871, "num_input_tokens_seen": 27724416, "step": 48070 }, { "epoch": 7.16041108132261, "grad_norm": 0.21127517521381378, "learning_rate": 1.1309005294058968e-05, "loss": 0.8186, "num_input_tokens_seen": 27727392, "step": 48075 }, { "epoch": 7.161155793863569, "grad_norm": 0.18631857633590698, "learning_rate": 1.1303568082526178e-05, "loss": 0.8008, "num_input_tokens_seen": 27730304, "step": 48080 }, { "epoch": 7.161900506404528, "grad_norm": 0.19344192743301392, "learning_rate": 1.1298131796542576e-05, "loss": 0.7829, "num_input_tokens_seen": 27733056, "step": 48085 }, { "epoch": 7.162645218945487, "grad_norm": 0.1757507026195526, "learning_rate": 1.1292696436475514e-05, "loss": 0.8012, "num_input_tokens_seen": 27735776, "step": 48090 }, { "epoch": 7.163389931486447, "grad_norm": 0.19731725752353668, "learning_rate": 1.1287262002692295e-05, "loss": 0.8122, "num_input_tokens_seen": 27738624, "step": 48095 }, { "epoch": 7.164134644027405, "grad_norm": 0.23937399685382843, "learning_rate": 1.1281828495560157e-05, "loss": 0.7757, "num_input_tokens_seen": 27741632, "step": 48100 }, { "epoch": 7.164879356568365, "grad_norm": 0.21672584116458893, "learning_rate": 1.1276395915446278e-05, "loss": 0.78, "num_input_tokens_seen": 27745120, "step": 48105 }, { "epoch": 7.165624069109324, "grad_norm": 0.20387043058872223, "learning_rate": 1.1270964262717773e-05, "loss": 0.779, "num_input_tokens_seen": 27747872, "step": 48110 }, { "epoch": 7.166368781650283, "grad_norm": 0.19389347732067108, "learning_rate": 1.126553353774168e-05, "loss": 0.7909, "num_input_tokens_seen": 27750464, "step": 48115 }, { "epoch": 7.167113494191242, "grad_norm": 0.21256600320339203, "learning_rate": 1.1260103740884986e-05, "loss": 0.794, "num_input_tokens_seen": 27753344, "step": 48120 }, { "epoch": 7.167858206732201, "grad_norm": 0.1906212866306305, "learning_rate": 1.1254674872514629e-05, "loss": 0.8386, "num_input_tokens_seen": 27756000, "step": 48125 }, { "epoch": 7.168602919273161, "grad_norm": 0.2257893830537796, "learning_rate": 1.124924693299745e-05, "loss": 0.794, "num_input_tokens_seen": 27758912, "step": 48130 }, { "epoch": 7.169347631814119, "grad_norm": 0.14185746014118195, "learning_rate": 1.124381992270026e-05, "loss": 0.7737, "num_input_tokens_seen": 27761664, "step": 48135 }, { "epoch": 7.170092344355079, "grad_norm": 0.2256738841533661, "learning_rate": 1.123839384198979e-05, "loss": 0.7931, "num_input_tokens_seen": 27764384, "step": 48140 }, { "epoch": 7.170837056896038, "grad_norm": 0.25048038363456726, "learning_rate": 1.123296869123272e-05, "loss": 0.7953, "num_input_tokens_seen": 27767488, "step": 48145 }, { "epoch": 7.171581769436997, "grad_norm": 0.26191431283950806, "learning_rate": 1.1227544470795645e-05, "loss": 0.7724, "num_input_tokens_seen": 27770560, "step": 48150 }, { "epoch": 7.172326481977956, "grad_norm": 0.2601478695869446, "learning_rate": 1.122212118104512e-05, "loss": 0.7853, "num_input_tokens_seen": 27773440, "step": 48155 }, { "epoch": 7.173071194518916, "grad_norm": 0.2589012384414673, "learning_rate": 1.1216698822347629e-05, "loss": 0.8266, "num_input_tokens_seen": 27776224, "step": 48160 }, { "epoch": 7.173815907059875, "grad_norm": 0.18441331386566162, "learning_rate": 1.1211277395069603e-05, "loss": 0.7807, "num_input_tokens_seen": 27779104, "step": 48165 }, { "epoch": 7.174560619600834, "grad_norm": 0.22431887686252594, "learning_rate": 1.120585689957738e-05, "loss": 0.8058, "num_input_tokens_seen": 27781792, "step": 48170 }, { "epoch": 7.175305332141793, "grad_norm": 0.18822872638702393, "learning_rate": 1.1200437336237265e-05, "loss": 0.795, "num_input_tokens_seen": 27784576, "step": 48175 }, { "epoch": 7.176050044682753, "grad_norm": 0.2841874361038208, "learning_rate": 1.11950187054155e-05, "loss": 0.803, "num_input_tokens_seen": 27787296, "step": 48180 }, { "epoch": 7.176794757223711, "grad_norm": 0.22145698964595795, "learning_rate": 1.1189601007478233e-05, "loss": 0.7936, "num_input_tokens_seen": 27790080, "step": 48185 }, { "epoch": 7.177539469764671, "grad_norm": 0.17389142513275146, "learning_rate": 1.1184184242791581e-05, "loss": 0.7812, "num_input_tokens_seen": 27793152, "step": 48190 }, { "epoch": 7.17828418230563, "grad_norm": 0.30109110474586487, "learning_rate": 1.1178768411721589e-05, "loss": 0.8676, "num_input_tokens_seen": 27796352, "step": 48195 }, { "epoch": 7.1790288948465895, "grad_norm": 0.16732116043567657, "learning_rate": 1.1173353514634232e-05, "loss": 0.8008, "num_input_tokens_seen": 27799040, "step": 48200 }, { "epoch": 7.179773607387548, "grad_norm": 0.19690589606761932, "learning_rate": 1.116793955189544e-05, "loss": 0.8102, "num_input_tokens_seen": 27801888, "step": 48205 }, { "epoch": 7.180518319928508, "grad_norm": 0.14026054739952087, "learning_rate": 1.1162526523871048e-05, "loss": 0.7855, "num_input_tokens_seen": 27804736, "step": 48210 }, { "epoch": 7.181263032469467, "grad_norm": 0.2091798335313797, "learning_rate": 1.115711443092686e-05, "loss": 0.7704, "num_input_tokens_seen": 27807488, "step": 48215 }, { "epoch": 7.182007745010426, "grad_norm": 0.15737907588481903, "learning_rate": 1.115170327342859e-05, "loss": 0.776, "num_input_tokens_seen": 27810368, "step": 48220 }, { "epoch": 7.182752457551385, "grad_norm": 0.15499338507652283, "learning_rate": 1.1146293051741913e-05, "loss": 0.7645, "num_input_tokens_seen": 27813088, "step": 48225 }, { "epoch": 7.183497170092345, "grad_norm": 0.25002816319465637, "learning_rate": 1.1140883766232422e-05, "loss": 0.8016, "num_input_tokens_seen": 27815840, "step": 48230 }, { "epoch": 7.184241882633303, "grad_norm": 0.3114635944366455, "learning_rate": 1.1135475417265662e-05, "loss": 0.8344, "num_input_tokens_seen": 27818720, "step": 48235 }, { "epoch": 7.184986595174263, "grad_norm": 0.27878889441490173, "learning_rate": 1.113006800520711e-05, "loss": 0.8058, "num_input_tokens_seen": 27821632, "step": 48240 }, { "epoch": 7.185731307715222, "grad_norm": 0.2140994668006897, "learning_rate": 1.1124661530422176e-05, "loss": 0.794, "num_input_tokens_seen": 27824448, "step": 48245 }, { "epoch": 7.1864760202561815, "grad_norm": 0.24386270344257355, "learning_rate": 1.111925599327619e-05, "loss": 0.7853, "num_input_tokens_seen": 27827552, "step": 48250 }, { "epoch": 7.18722073279714, "grad_norm": 0.2927089035511017, "learning_rate": 1.111385139413445e-05, "loss": 0.8263, "num_input_tokens_seen": 27830304, "step": 48255 }, { "epoch": 7.1879654453381, "grad_norm": 0.19259481132030487, "learning_rate": 1.1108447733362177e-05, "loss": 0.7857, "num_input_tokens_seen": 27833280, "step": 48260 }, { "epoch": 7.188710157879059, "grad_norm": 0.1913982331752777, "learning_rate": 1.1103045011324526e-05, "loss": 0.7979, "num_input_tokens_seen": 27836032, "step": 48265 }, { "epoch": 7.189454870420018, "grad_norm": 0.2407289743423462, "learning_rate": 1.1097643228386593e-05, "loss": 0.7937, "num_input_tokens_seen": 27838816, "step": 48270 }, { "epoch": 7.190199582960977, "grad_norm": 0.24271242320537567, "learning_rate": 1.1092242384913415e-05, "loss": 0.8123, "num_input_tokens_seen": 27841536, "step": 48275 }, { "epoch": 7.190944295501936, "grad_norm": 0.20026464760303497, "learning_rate": 1.1086842481269943e-05, "loss": 0.772, "num_input_tokens_seen": 27844320, "step": 48280 }, { "epoch": 7.1916890080428955, "grad_norm": 0.26912590861320496, "learning_rate": 1.10814435178211e-05, "loss": 0.7779, "num_input_tokens_seen": 27847264, "step": 48285 }, { "epoch": 7.192433720583854, "grad_norm": 0.21671070158481598, "learning_rate": 1.1076045494931705e-05, "loss": 0.8257, "num_input_tokens_seen": 27850048, "step": 48290 }, { "epoch": 7.193178433124814, "grad_norm": 0.17297516763210297, "learning_rate": 1.1070648412966548e-05, "loss": 0.7803, "num_input_tokens_seen": 27852736, "step": 48295 }, { "epoch": 7.193923145665773, "grad_norm": 0.5685930848121643, "learning_rate": 1.1065252272290333e-05, "loss": 0.806, "num_input_tokens_seen": 27856064, "step": 48300 }, { "epoch": 7.194667858206732, "grad_norm": 0.27853813767433167, "learning_rate": 1.1059857073267718e-05, "loss": 0.767, "num_input_tokens_seen": 27858720, "step": 48305 }, { "epoch": 7.195412570747691, "grad_norm": 0.1892496794462204, "learning_rate": 1.1054462816263295e-05, "loss": 0.8114, "num_input_tokens_seen": 27861408, "step": 48310 }, { "epoch": 7.196157283288651, "grad_norm": 0.24193929135799408, "learning_rate": 1.1049069501641567e-05, "loss": 0.7873, "num_input_tokens_seen": 27864352, "step": 48315 }, { "epoch": 7.196901995829609, "grad_norm": 0.23331975936889648, "learning_rate": 1.1043677129767002e-05, "loss": 0.8105, "num_input_tokens_seen": 27867264, "step": 48320 }, { "epoch": 7.197646708370569, "grad_norm": 0.17851243913173676, "learning_rate": 1.1038285701004003e-05, "loss": 0.8147, "num_input_tokens_seen": 27869984, "step": 48325 }, { "epoch": 7.198391420911528, "grad_norm": 0.18188853561878204, "learning_rate": 1.1032895215716881e-05, "loss": 0.822, "num_input_tokens_seen": 27872832, "step": 48330 }, { "epoch": 7.1991361334524875, "grad_norm": 0.16568806767463684, "learning_rate": 1.1027505674269916e-05, "loss": 0.8125, "num_input_tokens_seen": 27875744, "step": 48335 }, { "epoch": 7.199880845993446, "grad_norm": 0.20817315578460693, "learning_rate": 1.102211707702731e-05, "loss": 0.815, "num_input_tokens_seen": 27878880, "step": 48340 }, { "epoch": 7.200625558534406, "grad_norm": 0.21068257093429565, "learning_rate": 1.1016729424353212e-05, "loss": 0.794, "num_input_tokens_seen": 27881760, "step": 48345 }, { "epoch": 7.201370271075365, "grad_norm": 0.1767912358045578, "learning_rate": 1.1011342716611678e-05, "loss": 0.8062, "num_input_tokens_seen": 27884672, "step": 48350 }, { "epoch": 7.202114983616324, "grad_norm": 0.28473690152168274, "learning_rate": 1.1005956954166729e-05, "loss": 0.8252, "num_input_tokens_seen": 27887456, "step": 48355 }, { "epoch": 7.202859696157283, "grad_norm": 0.3356371223926544, "learning_rate": 1.1000572137382314e-05, "loss": 0.8107, "num_input_tokens_seen": 27890304, "step": 48360 }, { "epoch": 7.203604408698243, "grad_norm": 0.2462906390428543, "learning_rate": 1.0995188266622324e-05, "loss": 0.8235, "num_input_tokens_seen": 27893472, "step": 48365 }, { "epoch": 7.2043491212392015, "grad_norm": 0.2245781123638153, "learning_rate": 1.0989805342250564e-05, "loss": 0.8005, "num_input_tokens_seen": 27896192, "step": 48370 }, { "epoch": 7.205093833780161, "grad_norm": 0.20440544188022614, "learning_rate": 1.0984423364630796e-05, "loss": 0.8089, "num_input_tokens_seen": 27898848, "step": 48375 }, { "epoch": 7.20583854632112, "grad_norm": 0.377238929271698, "learning_rate": 1.0979042334126724e-05, "loss": 0.8064, "num_input_tokens_seen": 27901984, "step": 48380 }, { "epoch": 7.2065832588620795, "grad_norm": 0.20252719521522522, "learning_rate": 1.0973662251101957e-05, "loss": 0.8084, "num_input_tokens_seen": 27904640, "step": 48385 }, { "epoch": 7.207327971403038, "grad_norm": 0.2456376850605011, "learning_rate": 1.0968283115920067e-05, "loss": 0.8101, "num_input_tokens_seen": 27907232, "step": 48390 }, { "epoch": 7.208072683943998, "grad_norm": 0.24468669295310974, "learning_rate": 1.0962904928944556e-05, "loss": 0.7826, "num_input_tokens_seen": 27910080, "step": 48395 }, { "epoch": 7.208817396484957, "grad_norm": 0.2256956845521927, "learning_rate": 1.095752769053886e-05, "loss": 0.8303, "num_input_tokens_seen": 27913024, "step": 48400 }, { "epoch": 7.209562109025916, "grad_norm": 0.332366019487381, "learning_rate": 1.0952151401066358e-05, "loss": 0.823, "num_input_tokens_seen": 27915616, "step": 48405 }, { "epoch": 7.210306821566875, "grad_norm": 0.24747343361377716, "learning_rate": 1.0946776060890352e-05, "loss": 0.8216, "num_input_tokens_seen": 27918432, "step": 48410 }, { "epoch": 7.211051534107835, "grad_norm": 0.17961928248405457, "learning_rate": 1.0941401670374071e-05, "loss": 0.7982, "num_input_tokens_seen": 27921184, "step": 48415 }, { "epoch": 7.2117962466487935, "grad_norm": 0.26675722002983093, "learning_rate": 1.093602822988071e-05, "loss": 0.8024, "num_input_tokens_seen": 27924000, "step": 48420 }, { "epoch": 7.212540959189753, "grad_norm": 0.20396395027637482, "learning_rate": 1.0930655739773379e-05, "loss": 0.7949, "num_input_tokens_seen": 27927104, "step": 48425 }, { "epoch": 7.213285671730712, "grad_norm": 0.18887688219547272, "learning_rate": 1.0925284200415134e-05, "loss": 0.7818, "num_input_tokens_seen": 27929728, "step": 48430 }, { "epoch": 7.2140303842716715, "grad_norm": 0.20778578519821167, "learning_rate": 1.0919913612168959e-05, "loss": 0.7718, "num_input_tokens_seen": 27932448, "step": 48435 }, { "epoch": 7.21477509681263, "grad_norm": 0.22174647450447083, "learning_rate": 1.0914543975397785e-05, "loss": 0.8132, "num_input_tokens_seen": 27935296, "step": 48440 }, { "epoch": 7.21551980935359, "grad_norm": 0.19495204091072083, "learning_rate": 1.090917529046446e-05, "loss": 0.77, "num_input_tokens_seen": 27938240, "step": 48445 }, { "epoch": 7.216264521894549, "grad_norm": 0.2565392851829529, "learning_rate": 1.0903807557731771e-05, "loss": 0.822, "num_input_tokens_seen": 27941088, "step": 48450 }, { "epoch": 7.217009234435508, "grad_norm": 0.3348724842071533, "learning_rate": 1.0898440777562458e-05, "loss": 0.8334, "num_input_tokens_seen": 27943936, "step": 48455 }, { "epoch": 7.217753946976467, "grad_norm": 0.27039870619773865, "learning_rate": 1.0893074950319182e-05, "loss": 0.8175, "num_input_tokens_seen": 27946880, "step": 48460 }, { "epoch": 7.218498659517426, "grad_norm": 0.26695823669433594, "learning_rate": 1.0887710076364548e-05, "loss": 0.8345, "num_input_tokens_seen": 27949600, "step": 48465 }, { "epoch": 7.2192433720583855, "grad_norm": 0.1855902224779129, "learning_rate": 1.088234615606109e-05, "loss": 0.8194, "num_input_tokens_seen": 27952480, "step": 48470 }, { "epoch": 7.219988084599344, "grad_norm": 0.2706981897354126, "learning_rate": 1.0876983189771292e-05, "loss": 0.7765, "num_input_tokens_seen": 27955232, "step": 48475 }, { "epoch": 7.220732797140304, "grad_norm": 0.18635275959968567, "learning_rate": 1.0871621177857539e-05, "loss": 0.7809, "num_input_tokens_seen": 27957952, "step": 48480 }, { "epoch": 7.221477509681263, "grad_norm": 0.1967485398054123, "learning_rate": 1.0866260120682195e-05, "loss": 0.7802, "num_input_tokens_seen": 27960704, "step": 48485 }, { "epoch": 7.222222222222222, "grad_norm": 0.18997032940387726, "learning_rate": 1.0860900018607518e-05, "loss": 0.8026, "num_input_tokens_seen": 27963552, "step": 48490 }, { "epoch": 7.222966934763181, "grad_norm": 0.26729267835617065, "learning_rate": 1.0855540871995734e-05, "loss": 0.7986, "num_input_tokens_seen": 27966464, "step": 48495 }, { "epoch": 7.223711647304141, "grad_norm": 0.20309703052043915, "learning_rate": 1.085018268120899e-05, "loss": 0.8131, "num_input_tokens_seen": 27969408, "step": 48500 }, { "epoch": 7.2244563598450995, "grad_norm": 0.23272743821144104, "learning_rate": 1.0844825446609368e-05, "loss": 0.7932, "num_input_tokens_seen": 27972288, "step": 48505 }, { "epoch": 7.225201072386059, "grad_norm": 0.18483847379684448, "learning_rate": 1.0839469168558905e-05, "loss": 0.789, "num_input_tokens_seen": 27975296, "step": 48510 }, { "epoch": 7.225945784927018, "grad_norm": 0.1523875743150711, "learning_rate": 1.0834113847419534e-05, "loss": 0.7808, "num_input_tokens_seen": 27978176, "step": 48515 }, { "epoch": 7.2266904974679775, "grad_norm": 0.23843929171562195, "learning_rate": 1.0828759483553152e-05, "loss": 0.7993, "num_input_tokens_seen": 27981216, "step": 48520 }, { "epoch": 7.227435210008936, "grad_norm": 0.20550312101840973, "learning_rate": 1.082340607732159e-05, "loss": 0.8092, "num_input_tokens_seen": 27984096, "step": 48525 }, { "epoch": 7.228179922549896, "grad_norm": 0.23019284009933472, "learning_rate": 1.0818053629086617e-05, "loss": 0.8092, "num_input_tokens_seen": 27986816, "step": 48530 }, { "epoch": 7.228924635090855, "grad_norm": 0.2727133631706238, "learning_rate": 1.081270213920991e-05, "loss": 0.7894, "num_input_tokens_seen": 27989632, "step": 48535 }, { "epoch": 7.229669347631814, "grad_norm": 0.25051149725914, "learning_rate": 1.0807351608053113e-05, "loss": 0.8453, "num_input_tokens_seen": 27992512, "step": 48540 }, { "epoch": 7.230414060172773, "grad_norm": 0.3141750395298004, "learning_rate": 1.0802002035977799e-05, "loss": 0.8176, "num_input_tokens_seen": 27995360, "step": 48545 }, { "epoch": 7.231158772713733, "grad_norm": 0.2893432080745697, "learning_rate": 1.0796653423345452e-05, "loss": 0.8042, "num_input_tokens_seen": 27998464, "step": 48550 }, { "epoch": 7.2319034852546915, "grad_norm": 0.25407135486602783, "learning_rate": 1.079130577051752e-05, "loss": 0.7928, "num_input_tokens_seen": 28001344, "step": 48555 }, { "epoch": 7.232648197795651, "grad_norm": 0.22948984801769257, "learning_rate": 1.0785959077855378e-05, "loss": 0.7828, "num_input_tokens_seen": 28004224, "step": 48560 }, { "epoch": 7.23339291033661, "grad_norm": 0.2333899587392807, "learning_rate": 1.0780613345720331e-05, "loss": 0.7974, "num_input_tokens_seen": 28007008, "step": 48565 }, { "epoch": 7.23413762287757, "grad_norm": 0.2070159912109375, "learning_rate": 1.077526857447363e-05, "loss": 0.7977, "num_input_tokens_seen": 28009792, "step": 48570 }, { "epoch": 7.234882335418528, "grad_norm": 0.31387701630592346, "learning_rate": 1.0769924764476446e-05, "loss": 0.8065, "num_input_tokens_seen": 28012608, "step": 48575 }, { "epoch": 7.235627047959488, "grad_norm": 0.3807637095451355, "learning_rate": 1.0764581916089883e-05, "loss": 0.8018, "num_input_tokens_seen": 28015392, "step": 48580 }, { "epoch": 7.236371760500447, "grad_norm": 0.25479936599731445, "learning_rate": 1.0759240029674994e-05, "loss": 0.7725, "num_input_tokens_seen": 28018240, "step": 48585 }, { "epoch": 7.237116473041406, "grad_norm": 0.21557512879371643, "learning_rate": 1.0753899105592768e-05, "loss": 0.7946, "num_input_tokens_seen": 28021152, "step": 48590 }, { "epoch": 7.237861185582365, "grad_norm": 0.2100597620010376, "learning_rate": 1.0748559144204117e-05, "loss": 0.791, "num_input_tokens_seen": 28024416, "step": 48595 }, { "epoch": 7.238605898123325, "grad_norm": 0.18807242810726166, "learning_rate": 1.07432201458699e-05, "loss": 0.7921, "num_input_tokens_seen": 28027360, "step": 48600 }, { "epoch": 7.2393506106642835, "grad_norm": 0.2066832184791565, "learning_rate": 1.0737882110950911e-05, "loss": 0.8018, "num_input_tokens_seen": 28030368, "step": 48605 }, { "epoch": 7.240095323205243, "grad_norm": 0.224285289645195, "learning_rate": 1.0732545039807862e-05, "loss": 0.7807, "num_input_tokens_seen": 28033216, "step": 48610 }, { "epoch": 7.240840035746202, "grad_norm": 0.22068347036838531, "learning_rate": 1.0727208932801403e-05, "loss": 0.8247, "num_input_tokens_seen": 28036096, "step": 48615 }, { "epoch": 7.241584748287162, "grad_norm": 0.1872699111700058, "learning_rate": 1.0721873790292136e-05, "loss": 0.808, "num_input_tokens_seen": 28038816, "step": 48620 }, { "epoch": 7.24232946082812, "grad_norm": 0.23379206657409668, "learning_rate": 1.0716539612640586e-05, "loss": 0.7819, "num_input_tokens_seen": 28041632, "step": 48625 }, { "epoch": 7.243074173369079, "grad_norm": 0.20393143594264984, "learning_rate": 1.071120640020722e-05, "loss": 0.7989, "num_input_tokens_seen": 28044480, "step": 48630 }, { "epoch": 7.243818885910039, "grad_norm": 0.41562381386756897, "learning_rate": 1.0705874153352428e-05, "loss": 0.8517, "num_input_tokens_seen": 28047936, "step": 48635 }, { "epoch": 7.2445635984509975, "grad_norm": 0.22569802403450012, "learning_rate": 1.0700542872436557e-05, "loss": 0.7873, "num_input_tokens_seen": 28050944, "step": 48640 }, { "epoch": 7.245308310991957, "grad_norm": 0.31155771017074585, "learning_rate": 1.0695212557819851e-05, "loss": 0.7838, "num_input_tokens_seen": 28054240, "step": 48645 }, { "epoch": 7.246053023532916, "grad_norm": 0.2590719759464264, "learning_rate": 1.0689883209862527e-05, "loss": 0.8134, "num_input_tokens_seen": 28057088, "step": 48650 }, { "epoch": 7.246797736073876, "grad_norm": 0.1440112590789795, "learning_rate": 1.0684554828924711e-05, "loss": 0.7908, "num_input_tokens_seen": 28060256, "step": 48655 }, { "epoch": 7.247542448614834, "grad_norm": 0.21979564428329468, "learning_rate": 1.0679227415366475e-05, "loss": 0.7945, "num_input_tokens_seen": 28063264, "step": 48660 }, { "epoch": 7.248287161155794, "grad_norm": 0.2011798769235611, "learning_rate": 1.0673900969547826e-05, "loss": 0.7976, "num_input_tokens_seen": 28066240, "step": 48665 }, { "epoch": 7.249031873696753, "grad_norm": 0.19096656143665314, "learning_rate": 1.0668575491828706e-05, "loss": 0.8066, "num_input_tokens_seen": 28068992, "step": 48670 }, { "epoch": 7.249776586237712, "grad_norm": 0.19379791617393494, "learning_rate": 1.0663250982568993e-05, "loss": 0.798, "num_input_tokens_seen": 28071872, "step": 48675 }, { "epoch": 7.250521298778671, "grad_norm": 0.24742324650287628, "learning_rate": 1.0657927442128482e-05, "loss": 0.8095, "num_input_tokens_seen": 28074944, "step": 48680 }, { "epoch": 7.251266011319631, "grad_norm": 0.18587560951709747, "learning_rate": 1.0652604870866923e-05, "loss": 0.8272, "num_input_tokens_seen": 28077824, "step": 48685 }, { "epoch": 7.2520107238605895, "grad_norm": 0.2053842395544052, "learning_rate": 1.0647283269144003e-05, "loss": 0.8037, "num_input_tokens_seen": 28080672, "step": 48690 }, { "epoch": 7.252755436401549, "grad_norm": 0.1999431550502777, "learning_rate": 1.064196263731932e-05, "loss": 0.7995, "num_input_tokens_seen": 28083520, "step": 48695 }, { "epoch": 7.253500148942508, "grad_norm": 0.226762056350708, "learning_rate": 1.0636642975752423e-05, "loss": 0.7828, "num_input_tokens_seen": 28086464, "step": 48700 }, { "epoch": 7.254244861483468, "grad_norm": 0.21846401691436768, "learning_rate": 1.0631324284802799e-05, "loss": 0.7712, "num_input_tokens_seen": 28089824, "step": 48705 }, { "epoch": 7.254989574024426, "grad_norm": 0.19030210375785828, "learning_rate": 1.0626006564829868e-05, "loss": 0.7827, "num_input_tokens_seen": 28092384, "step": 48710 }, { "epoch": 7.255734286565386, "grad_norm": 0.21352334320545197, "learning_rate": 1.0620689816192967e-05, "loss": 0.8543, "num_input_tokens_seen": 28095136, "step": 48715 }, { "epoch": 7.256478999106345, "grad_norm": 0.22553765773773193, "learning_rate": 1.0615374039251382e-05, "loss": 0.8153, "num_input_tokens_seen": 28098080, "step": 48720 }, { "epoch": 7.257223711647304, "grad_norm": 0.22461900115013123, "learning_rate": 1.061005923436434e-05, "loss": 0.8177, "num_input_tokens_seen": 28101088, "step": 48725 }, { "epoch": 7.257968424188263, "grad_norm": 0.17555324733257294, "learning_rate": 1.0604745401890997e-05, "loss": 0.7813, "num_input_tokens_seen": 28103744, "step": 48730 }, { "epoch": 7.258713136729223, "grad_norm": 0.1910294145345688, "learning_rate": 1.0599432542190424e-05, "loss": 0.7876, "num_input_tokens_seen": 28106400, "step": 48735 }, { "epoch": 7.259457849270182, "grad_norm": 0.2087005376815796, "learning_rate": 1.0594120655621659e-05, "loss": 0.7829, "num_input_tokens_seen": 28109280, "step": 48740 }, { "epoch": 7.260202561811141, "grad_norm": 0.11803470551967621, "learning_rate": 1.0588809742543643e-05, "loss": 0.7829, "num_input_tokens_seen": 28111968, "step": 48745 }, { "epoch": 7.2609472743521, "grad_norm": 0.2090468555688858, "learning_rate": 1.0583499803315271e-05, "loss": 0.7656, "num_input_tokens_seen": 28115040, "step": 48750 }, { "epoch": 7.26169198689306, "grad_norm": 0.1864166110754013, "learning_rate": 1.0578190838295371e-05, "loss": 0.7785, "num_input_tokens_seen": 28118016, "step": 48755 }, { "epoch": 7.262436699434018, "grad_norm": 0.2287910133600235, "learning_rate": 1.0572882847842696e-05, "loss": 0.795, "num_input_tokens_seen": 28120896, "step": 48760 }, { "epoch": 7.263181411974978, "grad_norm": 0.1574554592370987, "learning_rate": 1.0567575832315947e-05, "loss": 0.8063, "num_input_tokens_seen": 28123648, "step": 48765 }, { "epoch": 7.263926124515937, "grad_norm": 0.17525449395179749, "learning_rate": 1.056226979207375e-05, "loss": 0.7663, "num_input_tokens_seen": 28126720, "step": 48770 }, { "epoch": 7.264670837056896, "grad_norm": 0.19121603667736053, "learning_rate": 1.0556964727474664e-05, "loss": 0.7752, "num_input_tokens_seen": 28129664, "step": 48775 }, { "epoch": 7.265415549597855, "grad_norm": 0.22849762439727783, "learning_rate": 1.055166063887717e-05, "loss": 0.7885, "num_input_tokens_seen": 28132768, "step": 48780 }, { "epoch": 7.266160262138815, "grad_norm": 0.25282758474349976, "learning_rate": 1.0546357526639705e-05, "loss": 0.746, "num_input_tokens_seen": 28136352, "step": 48785 }, { "epoch": 7.266904974679774, "grad_norm": 0.18995420634746552, "learning_rate": 1.0541055391120638e-05, "loss": 0.7765, "num_input_tokens_seen": 28139232, "step": 48790 }, { "epoch": 7.267649687220732, "grad_norm": 0.16073046624660492, "learning_rate": 1.053575423267826e-05, "loss": 0.8128, "num_input_tokens_seen": 28141984, "step": 48795 }, { "epoch": 7.268394399761692, "grad_norm": 0.30547353625297546, "learning_rate": 1.0530454051670805e-05, "loss": 0.7842, "num_input_tokens_seen": 28144768, "step": 48800 }, { "epoch": 7.269139112302652, "grad_norm": 0.2349681407213211, "learning_rate": 1.0525154848456442e-05, "loss": 0.7947, "num_input_tokens_seen": 28147552, "step": 48805 }, { "epoch": 7.26988382484361, "grad_norm": 0.2916005551815033, "learning_rate": 1.0519856623393268e-05, "loss": 0.8155, "num_input_tokens_seen": 28150144, "step": 48810 }, { "epoch": 7.270628537384569, "grad_norm": 0.255790114402771, "learning_rate": 1.05145593768393e-05, "loss": 0.7814, "num_input_tokens_seen": 28153088, "step": 48815 }, { "epoch": 7.271373249925529, "grad_norm": 0.20684653520584106, "learning_rate": 1.0509263109152518e-05, "loss": 0.782, "num_input_tokens_seen": 28156096, "step": 48820 }, { "epoch": 7.272117962466488, "grad_norm": 0.18979206681251526, "learning_rate": 1.0503967820690817e-05, "loss": 0.8027, "num_input_tokens_seen": 28159008, "step": 48825 }, { "epoch": 7.272862675007447, "grad_norm": 0.2161402553319931, "learning_rate": 1.0498673511812035e-05, "loss": 0.7828, "num_input_tokens_seen": 28161984, "step": 48830 }, { "epoch": 7.273607387548406, "grad_norm": 0.18713267147541046, "learning_rate": 1.0493380182873941e-05, "loss": 0.7918, "num_input_tokens_seen": 28164672, "step": 48835 }, { "epoch": 7.274352100089366, "grad_norm": 0.27093377709388733, "learning_rate": 1.0488087834234242e-05, "loss": 0.8245, "num_input_tokens_seen": 28167424, "step": 48840 }, { "epoch": 7.275096812630324, "grad_norm": 0.213261216878891, "learning_rate": 1.0482796466250554e-05, "loss": 0.7657, "num_input_tokens_seen": 28170400, "step": 48845 }, { "epoch": 7.275841525171284, "grad_norm": 0.1765131801366806, "learning_rate": 1.0477506079280472e-05, "loss": 0.7905, "num_input_tokens_seen": 28173824, "step": 48850 }, { "epoch": 7.276586237712243, "grad_norm": 0.24634386599063873, "learning_rate": 1.0472216673681476e-05, "loss": 0.7978, "num_input_tokens_seen": 28176416, "step": 48855 }, { "epoch": 7.277330950253202, "grad_norm": 0.2790469229221344, "learning_rate": 1.0466928249811009e-05, "loss": 0.7953, "num_input_tokens_seen": 28179424, "step": 48860 }, { "epoch": 7.278075662794161, "grad_norm": 0.23126977682113647, "learning_rate": 1.0461640808026444e-05, "loss": 0.8066, "num_input_tokens_seen": 28182368, "step": 48865 }, { "epoch": 7.278820375335121, "grad_norm": 0.24249430000782013, "learning_rate": 1.0456354348685085e-05, "loss": 0.7801, "num_input_tokens_seen": 28185504, "step": 48870 }, { "epoch": 7.27956508787608, "grad_norm": 0.18642400205135345, "learning_rate": 1.0451068872144179e-05, "loss": 0.7934, "num_input_tokens_seen": 28188256, "step": 48875 }, { "epoch": 7.280309800417039, "grad_norm": 0.20855233073234558, "learning_rate": 1.0445784378760876e-05, "loss": 0.8252, "num_input_tokens_seen": 28191200, "step": 48880 }, { "epoch": 7.281054512957998, "grad_norm": 0.22521981596946716, "learning_rate": 1.0440500868892294e-05, "loss": 0.788, "num_input_tokens_seen": 28194144, "step": 48885 }, { "epoch": 7.281799225498958, "grad_norm": 0.20144256949424744, "learning_rate": 1.0435218342895475e-05, "loss": 0.7976, "num_input_tokens_seen": 28196768, "step": 48890 }, { "epoch": 7.282543938039916, "grad_norm": 0.1525195688009262, "learning_rate": 1.0429936801127377e-05, "loss": 0.7884, "num_input_tokens_seen": 28199776, "step": 48895 }, { "epoch": 7.283288650580876, "grad_norm": 0.17108795046806335, "learning_rate": 1.0424656243944913e-05, "loss": 0.7859, "num_input_tokens_seen": 28202560, "step": 48900 }, { "epoch": 7.284033363121835, "grad_norm": 0.29242706298828125, "learning_rate": 1.0419376671704928e-05, "loss": 0.8035, "num_input_tokens_seen": 28205632, "step": 48905 }, { "epoch": 7.2847780756627944, "grad_norm": 0.42633405327796936, "learning_rate": 1.0414098084764178e-05, "loss": 0.8411, "num_input_tokens_seen": 28208320, "step": 48910 }, { "epoch": 7.285522788203753, "grad_norm": 0.26494866609573364, "learning_rate": 1.0408820483479382e-05, "loss": 0.7807, "num_input_tokens_seen": 28211584, "step": 48915 }, { "epoch": 7.286267500744713, "grad_norm": 0.2252727597951889, "learning_rate": 1.0403543868207169e-05, "loss": 0.8056, "num_input_tokens_seen": 28214688, "step": 48920 }, { "epoch": 7.287012213285672, "grad_norm": 0.17529761791229248, "learning_rate": 1.0398268239304118e-05, "loss": 0.814, "num_input_tokens_seen": 28217440, "step": 48925 }, { "epoch": 7.287756925826631, "grad_norm": 0.23808647692203522, "learning_rate": 1.0392993597126743e-05, "loss": 0.798, "num_input_tokens_seen": 28220384, "step": 48930 }, { "epoch": 7.28850163836759, "grad_norm": 0.1590505987405777, "learning_rate": 1.0387719942031462e-05, "loss": 0.831, "num_input_tokens_seen": 28223040, "step": 48935 }, { "epoch": 7.28924635090855, "grad_norm": 0.21950550377368927, "learning_rate": 1.0382447274374667e-05, "loss": 0.7977, "num_input_tokens_seen": 28225632, "step": 48940 }, { "epoch": 7.289991063449508, "grad_norm": 0.2529800236225128, "learning_rate": 1.037717559451265e-05, "loss": 0.8039, "num_input_tokens_seen": 28228640, "step": 48945 }, { "epoch": 7.290735775990468, "grad_norm": 0.22369180619716644, "learning_rate": 1.0371904902801651e-05, "loss": 0.7836, "num_input_tokens_seen": 28231520, "step": 48950 }, { "epoch": 7.291480488531427, "grad_norm": 0.14854396879673004, "learning_rate": 1.0366635199597846e-05, "loss": 0.8067, "num_input_tokens_seen": 28234496, "step": 48955 }, { "epoch": 7.292225201072386, "grad_norm": 0.2127370685338974, "learning_rate": 1.0361366485257338e-05, "loss": 0.78, "num_input_tokens_seen": 28237568, "step": 48960 }, { "epoch": 7.292969913613345, "grad_norm": 0.2068774700164795, "learning_rate": 1.0356098760136168e-05, "loss": 0.8143, "num_input_tokens_seen": 28240640, "step": 48965 }, { "epoch": 7.293714626154305, "grad_norm": 0.2161387801170349, "learning_rate": 1.0350832024590312e-05, "loss": 0.8034, "num_input_tokens_seen": 28243616, "step": 48970 }, { "epoch": 7.294459338695264, "grad_norm": 0.17038092017173767, "learning_rate": 1.0345566278975671e-05, "loss": 0.7727, "num_input_tokens_seen": 28246848, "step": 48975 }, { "epoch": 7.295204051236222, "grad_norm": 0.17784623801708221, "learning_rate": 1.0340301523648071e-05, "loss": 0.7911, "num_input_tokens_seen": 28249600, "step": 48980 }, { "epoch": 7.295948763777182, "grad_norm": 0.33608278632164, "learning_rate": 1.0335037758963296e-05, "loss": 0.7852, "num_input_tokens_seen": 28252448, "step": 48985 }, { "epoch": 7.296693476318141, "grad_norm": 0.22951480746269226, "learning_rate": 1.0329774985277042e-05, "loss": 0.7724, "num_input_tokens_seen": 28255616, "step": 48990 }, { "epoch": 7.2974381888591004, "grad_norm": 0.2517758905887604, "learning_rate": 1.0324513202944947e-05, "loss": 0.7793, "num_input_tokens_seen": 28258720, "step": 48995 }, { "epoch": 7.298182901400059, "grad_norm": 0.19673115015029907, "learning_rate": 1.0319252412322586e-05, "loss": 0.8207, "num_input_tokens_seen": 28262016, "step": 49000 }, { "epoch": 7.298927613941019, "grad_norm": 0.2528778612613678, "learning_rate": 1.0313992613765469e-05, "loss": 0.7882, "num_input_tokens_seen": 28264832, "step": 49005 }, { "epoch": 7.299672326481978, "grad_norm": 0.1751278042793274, "learning_rate": 1.0308733807629022e-05, "loss": 0.8131, "num_input_tokens_seen": 28267488, "step": 49010 }, { "epoch": 7.300417039022937, "grad_norm": 0.17474713921546936, "learning_rate": 1.0303475994268606e-05, "loss": 0.7827, "num_input_tokens_seen": 28270432, "step": 49015 }, { "epoch": 7.301161751563896, "grad_norm": 0.18314974009990692, "learning_rate": 1.029821917403953e-05, "loss": 0.8146, "num_input_tokens_seen": 28273280, "step": 49020 }, { "epoch": 7.301906464104856, "grad_norm": 0.30161669850349426, "learning_rate": 1.0292963347297027e-05, "loss": 0.7703, "num_input_tokens_seen": 28276352, "step": 49025 }, { "epoch": 7.302651176645814, "grad_norm": 0.22233767807483673, "learning_rate": 1.0287708514396268e-05, "loss": 0.7878, "num_input_tokens_seen": 28279360, "step": 49030 }, { "epoch": 7.303395889186774, "grad_norm": 0.16078485548496246, "learning_rate": 1.0282454675692354e-05, "loss": 0.8231, "num_input_tokens_seen": 28282176, "step": 49035 }, { "epoch": 7.304140601727733, "grad_norm": 0.2877315878868103, "learning_rate": 1.0277201831540323e-05, "loss": 0.8288, "num_input_tokens_seen": 28285248, "step": 49040 }, { "epoch": 7.3048853142686925, "grad_norm": 0.2578139305114746, "learning_rate": 1.0271949982295123e-05, "loss": 0.8003, "num_input_tokens_seen": 28288032, "step": 49045 }, { "epoch": 7.305630026809651, "grad_norm": 0.18884073197841644, "learning_rate": 1.0266699128311675e-05, "loss": 0.8, "num_input_tokens_seen": 28291072, "step": 49050 }, { "epoch": 7.306374739350611, "grad_norm": 0.21435537934303284, "learning_rate": 1.0261449269944786e-05, "loss": 0.7763, "num_input_tokens_seen": 28293856, "step": 49055 }, { "epoch": 7.30711945189157, "grad_norm": 0.24594342708587646, "learning_rate": 1.0256200407549239e-05, "loss": 0.7815, "num_input_tokens_seen": 28296608, "step": 49060 }, { "epoch": 7.307864164432529, "grad_norm": 0.2426760345697403, "learning_rate": 1.0250952541479719e-05, "loss": 0.7971, "num_input_tokens_seen": 28299264, "step": 49065 }, { "epoch": 7.308608876973488, "grad_norm": 0.2370838075876236, "learning_rate": 1.0245705672090872e-05, "loss": 0.8131, "num_input_tokens_seen": 28302304, "step": 49070 }, { "epoch": 7.309353589514448, "grad_norm": 0.19739307463169098, "learning_rate": 1.024045979973724e-05, "loss": 0.7944, "num_input_tokens_seen": 28305024, "step": 49075 }, { "epoch": 7.3100983020554064, "grad_norm": 0.25482189655303955, "learning_rate": 1.0235214924773326e-05, "loss": 0.8081, "num_input_tokens_seen": 28307904, "step": 49080 }, { "epoch": 7.310843014596366, "grad_norm": 0.19153285026550293, "learning_rate": 1.0229971047553557e-05, "loss": 0.818, "num_input_tokens_seen": 28310880, "step": 49085 }, { "epoch": 7.311587727137325, "grad_norm": 0.19329488277435303, "learning_rate": 1.0224728168432307e-05, "loss": 0.7717, "num_input_tokens_seen": 28314112, "step": 49090 }, { "epoch": 7.3123324396782845, "grad_norm": 0.18821260333061218, "learning_rate": 1.0219486287763844e-05, "loss": 0.7953, "num_input_tokens_seen": 28317088, "step": 49095 }, { "epoch": 7.313077152219243, "grad_norm": 0.28171125054359436, "learning_rate": 1.0214245405902406e-05, "loss": 0.8175, "num_input_tokens_seen": 28319872, "step": 49100 }, { "epoch": 7.313821864760203, "grad_norm": 0.2132711559534073, "learning_rate": 1.0209005523202155e-05, "loss": 0.8009, "num_input_tokens_seen": 28322784, "step": 49105 }, { "epoch": 7.314566577301162, "grad_norm": 0.22859175503253937, "learning_rate": 1.0203766640017167e-05, "loss": 0.8063, "num_input_tokens_seen": 28325536, "step": 49110 }, { "epoch": 7.315311289842121, "grad_norm": 0.25473153591156006, "learning_rate": 1.0198528756701475e-05, "loss": 0.7844, "num_input_tokens_seen": 28328224, "step": 49115 }, { "epoch": 7.31605600238308, "grad_norm": 0.14930540323257446, "learning_rate": 1.019329187360903e-05, "loss": 0.7967, "num_input_tokens_seen": 28331328, "step": 49120 }, { "epoch": 7.31680071492404, "grad_norm": 0.3184463083744049, "learning_rate": 1.0188055991093717e-05, "loss": 0.8172, "num_input_tokens_seen": 28334048, "step": 49125 }, { "epoch": 7.3175454274649985, "grad_norm": 0.20277312397956848, "learning_rate": 1.0182821109509364e-05, "loss": 0.8046, "num_input_tokens_seen": 28337152, "step": 49130 }, { "epoch": 7.318290140005958, "grad_norm": 0.2640363276004791, "learning_rate": 1.0177587229209726e-05, "loss": 0.8311, "num_input_tokens_seen": 28339776, "step": 49135 }, { "epoch": 7.319034852546917, "grad_norm": 0.2604680359363556, "learning_rate": 1.0172354350548477e-05, "loss": 0.77, "num_input_tokens_seen": 28342848, "step": 49140 }, { "epoch": 7.319779565087876, "grad_norm": 0.19551461935043335, "learning_rate": 1.0167122473879228e-05, "loss": 0.7908, "num_input_tokens_seen": 28345920, "step": 49145 }, { "epoch": 7.320524277628835, "grad_norm": 0.2371763288974762, "learning_rate": 1.0161891599555536e-05, "loss": 0.7781, "num_input_tokens_seen": 28348960, "step": 49150 }, { "epoch": 7.321268990169794, "grad_norm": 0.24115607142448425, "learning_rate": 1.0156661727930886e-05, "loss": 0.81, "num_input_tokens_seen": 28351776, "step": 49155 }, { "epoch": 7.322013702710754, "grad_norm": 0.1945493519306183, "learning_rate": 1.0151432859358684e-05, "loss": 0.78, "num_input_tokens_seen": 28354496, "step": 49160 }, { "epoch": 7.3227584152517124, "grad_norm": 0.23901380598545074, "learning_rate": 1.0146204994192283e-05, "loss": 0.8031, "num_input_tokens_seen": 28357472, "step": 49165 }, { "epoch": 7.323503127792672, "grad_norm": 0.1881811022758484, "learning_rate": 1.0140978132784962e-05, "loss": 0.8018, "num_input_tokens_seen": 28360640, "step": 49170 }, { "epoch": 7.324247840333631, "grad_norm": 0.18399247527122498, "learning_rate": 1.013575227548993e-05, "loss": 0.7942, "num_input_tokens_seen": 28363552, "step": 49175 }, { "epoch": 7.3249925528745905, "grad_norm": 0.265279620885849, "learning_rate": 1.0130527422660313e-05, "loss": 0.8351, "num_input_tokens_seen": 28366432, "step": 49180 }, { "epoch": 7.325737265415549, "grad_norm": 0.23678356409072876, "learning_rate": 1.01253035746492e-05, "loss": 0.8047, "num_input_tokens_seen": 28369408, "step": 49185 }, { "epoch": 7.326481977956509, "grad_norm": 0.2126317173242569, "learning_rate": 1.0120080731809595e-05, "loss": 0.7921, "num_input_tokens_seen": 28372416, "step": 49190 }, { "epoch": 7.327226690497468, "grad_norm": 0.22149139642715454, "learning_rate": 1.0114858894494437e-05, "loss": 0.8142, "num_input_tokens_seen": 28375360, "step": 49195 }, { "epoch": 7.327971403038427, "grad_norm": 0.1683373749256134, "learning_rate": 1.0109638063056595e-05, "loss": 0.788, "num_input_tokens_seen": 28378272, "step": 49200 }, { "epoch": 7.328716115579386, "grad_norm": 0.20413658022880554, "learning_rate": 1.0104418237848883e-05, "loss": 0.802, "num_input_tokens_seen": 28380992, "step": 49205 }, { "epoch": 7.329460828120346, "grad_norm": 0.2507375180721283, "learning_rate": 1.0099199419224018e-05, "loss": 0.8048, "num_input_tokens_seen": 28383712, "step": 49210 }, { "epoch": 7.3302055406613045, "grad_norm": 0.24169740080833435, "learning_rate": 1.0093981607534683e-05, "loss": 0.7769, "num_input_tokens_seen": 28386496, "step": 49215 }, { "epoch": 7.330950253202264, "grad_norm": 0.2493925839662552, "learning_rate": 1.0088764803133454e-05, "loss": 0.7803, "num_input_tokens_seen": 28389184, "step": 49220 }, { "epoch": 7.331694965743223, "grad_norm": 0.22140467166900635, "learning_rate": 1.0083549006372881e-05, "loss": 0.8371, "num_input_tokens_seen": 28391904, "step": 49225 }, { "epoch": 7.3324396782841825, "grad_norm": 0.25170478224754333, "learning_rate": 1.0078334217605418e-05, "loss": 0.7969, "num_input_tokens_seen": 28394720, "step": 49230 }, { "epoch": 7.333184390825141, "grad_norm": 0.21940948069095612, "learning_rate": 1.007312043718347e-05, "loss": 0.8135, "num_input_tokens_seen": 28397408, "step": 49235 }, { "epoch": 7.333929103366101, "grad_norm": 0.27561941742897034, "learning_rate": 1.0067907665459344e-05, "loss": 0.7942, "num_input_tokens_seen": 28400192, "step": 49240 }, { "epoch": 7.33467381590706, "grad_norm": 0.24133792519569397, "learning_rate": 1.006269590278531e-05, "loss": 0.8009, "num_input_tokens_seen": 28403040, "step": 49245 }, { "epoch": 7.335418528448019, "grad_norm": 0.23616880178451538, "learning_rate": 1.0057485149513557e-05, "loss": 0.8142, "num_input_tokens_seen": 28405792, "step": 49250 }, { "epoch": 7.336163240988978, "grad_norm": 0.21342185139656067, "learning_rate": 1.0052275405996214e-05, "loss": 0.7868, "num_input_tokens_seen": 28408992, "step": 49255 }, { "epoch": 7.336907953529938, "grad_norm": 0.22358006238937378, "learning_rate": 1.0047066672585317e-05, "loss": 0.7919, "num_input_tokens_seen": 28411936, "step": 49260 }, { "epoch": 7.3376526660708965, "grad_norm": 0.19771285355091095, "learning_rate": 1.004185894963286e-05, "loss": 0.8103, "num_input_tokens_seen": 28414848, "step": 49265 }, { "epoch": 7.338397378611856, "grad_norm": 0.15413400530815125, "learning_rate": 1.0036652237490768e-05, "loss": 0.793, "num_input_tokens_seen": 28417728, "step": 49270 }, { "epoch": 7.339142091152815, "grad_norm": 0.22657547891139984, "learning_rate": 1.0031446536510875e-05, "loss": 0.8015, "num_input_tokens_seen": 28420768, "step": 49275 }, { "epoch": 7.3398868036937746, "grad_norm": 0.13822753727436066, "learning_rate": 1.0026241847044964e-05, "loss": 0.7649, "num_input_tokens_seen": 28423808, "step": 49280 }, { "epoch": 7.340631516234733, "grad_norm": 0.25291287899017334, "learning_rate": 1.0021038169444752e-05, "loss": 0.8171, "num_input_tokens_seen": 28426560, "step": 49285 }, { "epoch": 7.341376228775693, "grad_norm": 0.24056601524353027, "learning_rate": 1.0015835504061879e-05, "loss": 0.8334, "num_input_tokens_seen": 28429184, "step": 49290 }, { "epoch": 7.342120941316652, "grad_norm": 0.25681763887405396, "learning_rate": 1.0010633851247933e-05, "loss": 0.803, "num_input_tokens_seen": 28431808, "step": 49295 }, { "epoch": 7.342865653857611, "grad_norm": 0.18049804866313934, "learning_rate": 1.0005433211354398e-05, "loss": 0.7901, "num_input_tokens_seen": 28434624, "step": 49300 }, { "epoch": 7.34361036639857, "grad_norm": 0.20847870409488678, "learning_rate": 1.0000233584732732e-05, "loss": 0.8323, "num_input_tokens_seen": 28437344, "step": 49305 }, { "epoch": 7.344355078939529, "grad_norm": 0.19736123085021973, "learning_rate": 9.995034971734288e-06, "loss": 0.7969, "num_input_tokens_seen": 28440224, "step": 49310 }, { "epoch": 7.3450997914804885, "grad_norm": 0.22154121100902557, "learning_rate": 9.989837372710374e-06, "loss": 0.7964, "num_input_tokens_seen": 28443008, "step": 49315 }, { "epoch": 7.345844504021448, "grad_norm": 0.21735329926013947, "learning_rate": 9.984640788012222e-06, "loss": 0.7468, "num_input_tokens_seen": 28447456, "step": 49320 }, { "epoch": 7.346589216562407, "grad_norm": 0.21012920141220093, "learning_rate": 9.979445217991001e-06, "loss": 0.8205, "num_input_tokens_seen": 28450464, "step": 49325 }, { "epoch": 7.347333929103366, "grad_norm": 0.19887128472328186, "learning_rate": 9.9742506629978e-06, "loss": 0.7837, "num_input_tokens_seen": 28453376, "step": 49330 }, { "epoch": 7.348078641644325, "grad_norm": 0.2310156226158142, "learning_rate": 9.96905712338366e-06, "loss": 0.843, "num_input_tokens_seen": 28456160, "step": 49335 }, { "epoch": 7.348823354185284, "grad_norm": 0.3160587251186371, "learning_rate": 9.963864599499528e-06, "loss": 0.7826, "num_input_tokens_seen": 28458848, "step": 49340 }, { "epoch": 7.349568066726244, "grad_norm": 0.1671856939792633, "learning_rate": 9.958673091696286e-06, "loss": 0.7889, "num_input_tokens_seen": 28462080, "step": 49345 }, { "epoch": 7.3503127792672025, "grad_norm": 0.2141360193490982, "learning_rate": 9.95348260032476e-06, "loss": 0.7733, "num_input_tokens_seen": 28464864, "step": 49350 }, { "epoch": 7.351057491808162, "grad_norm": 0.24706263840198517, "learning_rate": 9.948293125735705e-06, "loss": 0.802, "num_input_tokens_seen": 28467904, "step": 49355 }, { "epoch": 7.351802204349121, "grad_norm": 0.2743144929409027, "learning_rate": 9.943104668279807e-06, "loss": 0.8057, "num_input_tokens_seen": 28471072, "step": 49360 }, { "epoch": 7.3525469168900806, "grad_norm": 0.25893452763557434, "learning_rate": 9.937917228307678e-06, "loss": 0.7776, "num_input_tokens_seen": 28473856, "step": 49365 }, { "epoch": 7.353291629431039, "grad_norm": 0.17148567736148834, "learning_rate": 9.932730806169873e-06, "loss": 0.8051, "num_input_tokens_seen": 28476608, "step": 49370 }, { "epoch": 7.354036341971999, "grad_norm": 0.3121100962162018, "learning_rate": 9.927545402216862e-06, "loss": 0.8224, "num_input_tokens_seen": 28479808, "step": 49375 }, { "epoch": 7.354781054512958, "grad_norm": 0.3113688826560974, "learning_rate": 9.922361016799045e-06, "loss": 0.8245, "num_input_tokens_seen": 28482752, "step": 49380 }, { "epoch": 7.355525767053917, "grad_norm": 0.2201630175113678, "learning_rate": 9.917177650266768e-06, "loss": 0.7917, "num_input_tokens_seen": 28485536, "step": 49385 }, { "epoch": 7.356270479594876, "grad_norm": 0.2900923788547516, "learning_rate": 9.911995302970301e-06, "loss": 0.7813, "num_input_tokens_seen": 28488608, "step": 49390 }, { "epoch": 7.357015192135836, "grad_norm": 0.17953450977802277, "learning_rate": 9.90681397525985e-06, "loss": 0.7679, "num_input_tokens_seen": 28491328, "step": 49395 }, { "epoch": 7.3577599046767945, "grad_norm": 0.28133413195610046, "learning_rate": 9.901633667485554e-06, "loss": 0.7897, "num_input_tokens_seen": 28494176, "step": 49400 }, { "epoch": 7.358504617217754, "grad_norm": 0.2554020583629608, "learning_rate": 9.89645437999746e-06, "loss": 0.8153, "num_input_tokens_seen": 28496864, "step": 49405 }, { "epoch": 7.359249329758713, "grad_norm": 0.22284077107906342, "learning_rate": 9.891276113145576e-06, "loss": 0.7974, "num_input_tokens_seen": 28499488, "step": 49410 }, { "epoch": 7.359994042299673, "grad_norm": 0.21135026216506958, "learning_rate": 9.886098867279831e-06, "loss": 0.7918, "num_input_tokens_seen": 28502400, "step": 49415 }, { "epoch": 7.360738754840631, "grad_norm": 0.19126230478286743, "learning_rate": 9.880922642750068e-06, "loss": 0.7987, "num_input_tokens_seen": 28505088, "step": 49420 }, { "epoch": 7.361483467381591, "grad_norm": 0.1955944448709488, "learning_rate": 9.87574743990608e-06, "loss": 0.8011, "num_input_tokens_seen": 28507936, "step": 49425 }, { "epoch": 7.36222817992255, "grad_norm": 0.15685699880123138, "learning_rate": 9.870573259097593e-06, "loss": 0.8183, "num_input_tokens_seen": 28510848, "step": 49430 }, { "epoch": 7.362972892463509, "grad_norm": 0.2413960099220276, "learning_rate": 9.86540010067426e-06, "loss": 0.7805, "num_input_tokens_seen": 28513536, "step": 49435 }, { "epoch": 7.363717605004468, "grad_norm": 0.20745660364627838, "learning_rate": 9.86022796498565e-06, "loss": 0.7602, "num_input_tokens_seen": 28516416, "step": 49440 }, { "epoch": 7.364462317545428, "grad_norm": 0.21366697549819946, "learning_rate": 9.855056852381275e-06, "loss": 0.8049, "num_input_tokens_seen": 28519328, "step": 49445 }, { "epoch": 7.3652070300863866, "grad_norm": 0.2485179454088211, "learning_rate": 9.849886763210586e-06, "loss": 0.8114, "num_input_tokens_seen": 28522496, "step": 49450 }, { "epoch": 7.365951742627346, "grad_norm": 0.2599147856235504, "learning_rate": 9.844717697822965e-06, "loss": 0.793, "num_input_tokens_seen": 28525440, "step": 49455 }, { "epoch": 7.366696455168305, "grad_norm": 0.26948657631874084, "learning_rate": 9.839549656567693e-06, "loss": 0.8139, "num_input_tokens_seen": 28528224, "step": 49460 }, { "epoch": 7.367441167709265, "grad_norm": 0.34071093797683716, "learning_rate": 9.834382639794015e-06, "loss": 0.821, "num_input_tokens_seen": 28531104, "step": 49465 }, { "epoch": 7.368185880250223, "grad_norm": 0.1932419389486313, "learning_rate": 9.829216647851111e-06, "loss": 0.8072, "num_input_tokens_seen": 28533696, "step": 49470 }, { "epoch": 7.368930592791183, "grad_norm": 0.23709778487682343, "learning_rate": 9.824051681088058e-06, "loss": 0.781, "num_input_tokens_seen": 28536768, "step": 49475 }, { "epoch": 7.369675305332142, "grad_norm": 0.2609577178955078, "learning_rate": 9.81888773985389e-06, "loss": 0.8141, "num_input_tokens_seen": 28539392, "step": 49480 }, { "epoch": 7.370420017873101, "grad_norm": 0.15356718003749847, "learning_rate": 9.81372482449757e-06, "loss": 0.8065, "num_input_tokens_seen": 28542272, "step": 49485 }, { "epoch": 7.37116473041406, "grad_norm": 0.1851210743188858, "learning_rate": 9.80856293536798e-06, "loss": 0.8179, "num_input_tokens_seen": 28544928, "step": 49490 }, { "epoch": 7.371909442955019, "grad_norm": 0.19727814197540283, "learning_rate": 9.803402072813953e-06, "loss": 0.8448, "num_input_tokens_seen": 28547712, "step": 49495 }, { "epoch": 7.372654155495979, "grad_norm": 0.25716283917427063, "learning_rate": 9.798242237184218e-06, "loss": 0.8078, "num_input_tokens_seen": 28550752, "step": 49500 }, { "epoch": 7.373398868036937, "grad_norm": 0.20736421644687653, "learning_rate": 9.793083428827477e-06, "loss": 0.7847, "num_input_tokens_seen": 28553632, "step": 49505 }, { "epoch": 7.374143580577897, "grad_norm": 0.22675763070583344, "learning_rate": 9.787925648092321e-06, "loss": 0.78, "num_input_tokens_seen": 28556608, "step": 49510 }, { "epoch": 7.374888293118856, "grad_norm": 0.19134320318698883, "learning_rate": 9.782768895327305e-06, "loss": 0.7766, "num_input_tokens_seen": 28559552, "step": 49515 }, { "epoch": 7.375633005659815, "grad_norm": 0.19361895322799683, "learning_rate": 9.777613170880898e-06, "loss": 0.8035, "num_input_tokens_seen": 28562272, "step": 49520 }, { "epoch": 7.376377718200774, "grad_norm": 0.23900945484638214, "learning_rate": 9.7724584751015e-06, "loss": 0.8144, "num_input_tokens_seen": 28565152, "step": 49525 }, { "epoch": 7.377122430741734, "grad_norm": 0.1867794692516327, "learning_rate": 9.767304808337451e-06, "loss": 0.7933, "num_input_tokens_seen": 28568096, "step": 49530 }, { "epoch": 7.3778671432826926, "grad_norm": 0.22214438021183014, "learning_rate": 9.76215217093702e-06, "loss": 0.786, "num_input_tokens_seen": 28570688, "step": 49535 }, { "epoch": 7.378611855823652, "grad_norm": 0.34758660197257996, "learning_rate": 9.757000563248389e-06, "loss": 0.8138, "num_input_tokens_seen": 28573856, "step": 49540 }, { "epoch": 7.379356568364611, "grad_norm": 0.22731390595436096, "learning_rate": 9.751849985619682e-06, "loss": 0.8315, "num_input_tokens_seen": 28576768, "step": 49545 }, { "epoch": 7.380101280905571, "grad_norm": 0.26985809206962585, "learning_rate": 9.746700438398957e-06, "loss": 0.8016, "num_input_tokens_seen": 28579584, "step": 49550 }, { "epoch": 7.380845993446529, "grad_norm": 0.21373289823532104, "learning_rate": 9.7415519219342e-06, "loss": 0.8311, "num_input_tokens_seen": 28582336, "step": 49555 }, { "epoch": 7.381590705987489, "grad_norm": 0.2233532965183258, "learning_rate": 9.736404436573327e-06, "loss": 0.7773, "num_input_tokens_seen": 28585056, "step": 49560 }, { "epoch": 7.382335418528448, "grad_norm": 0.21805629134178162, "learning_rate": 9.731257982664196e-06, "loss": 0.7941, "num_input_tokens_seen": 28587680, "step": 49565 }, { "epoch": 7.383080131069407, "grad_norm": 0.2438477724790573, "learning_rate": 9.726112560554562e-06, "loss": 0.798, "num_input_tokens_seen": 28590272, "step": 49570 }, { "epoch": 7.383824843610366, "grad_norm": 0.1725052297115326, "learning_rate": 9.72096817059215e-06, "loss": 0.8039, "num_input_tokens_seen": 28592832, "step": 49575 }, { "epoch": 7.384569556151326, "grad_norm": 0.3390118181705475, "learning_rate": 9.715824813124582e-06, "loss": 0.8284, "num_input_tokens_seen": 28595648, "step": 49580 }, { "epoch": 7.385314268692285, "grad_norm": 0.3254803419113159, "learning_rate": 9.710682488499434e-06, "loss": 0.7732, "num_input_tokens_seen": 28598688, "step": 49585 }, { "epoch": 7.386058981233244, "grad_norm": 0.22846148908138275, "learning_rate": 9.7055411970642e-06, "loss": 0.7915, "num_input_tokens_seen": 28601600, "step": 49590 }, { "epoch": 7.386803693774203, "grad_norm": 0.21209365129470825, "learning_rate": 9.700400939166308e-06, "loss": 0.809, "num_input_tokens_seen": 28604480, "step": 49595 }, { "epoch": 7.387548406315163, "grad_norm": 0.20350533723831177, "learning_rate": 9.695261715153126e-06, "loss": 0.7885, "num_input_tokens_seen": 28607328, "step": 49600 }, { "epoch": 7.388293118856121, "grad_norm": 0.317367821931839, "learning_rate": 9.690123525371925e-06, "loss": 0.7991, "num_input_tokens_seen": 28610240, "step": 49605 }, { "epoch": 7.389037831397081, "grad_norm": 0.2783200144767761, "learning_rate": 9.68498637016993e-06, "loss": 0.8248, "num_input_tokens_seen": 28613088, "step": 49610 }, { "epoch": 7.38978254393804, "grad_norm": 0.2529085576534271, "learning_rate": 9.679850249894298e-06, "loss": 0.8034, "num_input_tokens_seen": 28616192, "step": 49615 }, { "epoch": 7.390527256478999, "grad_norm": 0.2105887085199356, "learning_rate": 9.67471516489209e-06, "loss": 0.7949, "num_input_tokens_seen": 28619136, "step": 49620 }, { "epoch": 7.391271969019958, "grad_norm": 0.17961668968200684, "learning_rate": 9.669581115510323e-06, "loss": 0.8006, "num_input_tokens_seen": 28622016, "step": 49625 }, { "epoch": 7.392016681560918, "grad_norm": 0.23686599731445312, "learning_rate": 9.664448102095939e-06, "loss": 0.78, "num_input_tokens_seen": 28625088, "step": 49630 }, { "epoch": 7.392761394101877, "grad_norm": 0.27553319931030273, "learning_rate": 9.659316124995806e-06, "loss": 0.8303, "num_input_tokens_seen": 28627872, "step": 49635 }, { "epoch": 7.393506106642836, "grad_norm": 0.2563740611076355, "learning_rate": 9.654185184556713e-06, "loss": 0.7997, "num_input_tokens_seen": 28630656, "step": 49640 }, { "epoch": 7.394250819183795, "grad_norm": 0.2008146196603775, "learning_rate": 9.649055281125394e-06, "loss": 0.8064, "num_input_tokens_seen": 28633568, "step": 49645 }, { "epoch": 7.394995531724755, "grad_norm": 0.28894492983818054, "learning_rate": 9.643926415048504e-06, "loss": 0.8332, "num_input_tokens_seen": 28636640, "step": 49650 }, { "epoch": 7.395740244265713, "grad_norm": 0.17645835876464844, "learning_rate": 9.638798586672645e-06, "loss": 0.8157, "num_input_tokens_seen": 28639264, "step": 49655 }, { "epoch": 7.396484956806672, "grad_norm": 0.2977616786956787, "learning_rate": 9.633671796344312e-06, "loss": 0.8156, "num_input_tokens_seen": 28642176, "step": 49660 }, { "epoch": 7.397229669347632, "grad_norm": 0.23913821578025818, "learning_rate": 9.628546044409966e-06, "loss": 0.7859, "num_input_tokens_seen": 28645088, "step": 49665 }, { "epoch": 7.3979743818885915, "grad_norm": 0.2239629477262497, "learning_rate": 9.623421331215992e-06, "loss": 0.7875, "num_input_tokens_seen": 28647904, "step": 49670 }, { "epoch": 7.39871909442955, "grad_norm": 0.2644181549549103, "learning_rate": 9.618297657108676e-06, "loss": 0.8193, "num_input_tokens_seen": 28650688, "step": 49675 }, { "epoch": 7.399463806970509, "grad_norm": 0.16383515298366547, "learning_rate": 9.61317502243427e-06, "loss": 0.7909, "num_input_tokens_seen": 28653536, "step": 49680 }, { "epoch": 7.400208519511469, "grad_norm": 0.1870124191045761, "learning_rate": 9.608053427538938e-06, "loss": 0.8083, "num_input_tokens_seen": 28656480, "step": 49685 }, { "epoch": 7.400953232052427, "grad_norm": 0.2120395451784134, "learning_rate": 9.602932872768775e-06, "loss": 0.807, "num_input_tokens_seen": 28659264, "step": 49690 }, { "epoch": 7.401697944593387, "grad_norm": 0.21417969465255737, "learning_rate": 9.597813358469817e-06, "loss": 0.8166, "num_input_tokens_seen": 28661920, "step": 49695 }, { "epoch": 7.402442657134346, "grad_norm": 0.16165955364704132, "learning_rate": 9.592694884988001e-06, "loss": 0.8067, "num_input_tokens_seen": 28664928, "step": 49700 }, { "epoch": 7.403187369675305, "grad_norm": 0.19609101116657257, "learning_rate": 9.587577452669235e-06, "loss": 0.7866, "num_input_tokens_seen": 28667648, "step": 49705 }, { "epoch": 7.403932082216264, "grad_norm": 0.2476574033498764, "learning_rate": 9.582461061859313e-06, "loss": 0.7803, "num_input_tokens_seen": 28670752, "step": 49710 }, { "epoch": 7.404676794757224, "grad_norm": 0.3036278784275055, "learning_rate": 9.577345712903988e-06, "loss": 0.7847, "num_input_tokens_seen": 28673920, "step": 49715 }, { "epoch": 7.405421507298183, "grad_norm": 0.24487020075321198, "learning_rate": 9.572231406148938e-06, "loss": 0.8061, "num_input_tokens_seen": 28676928, "step": 49720 }, { "epoch": 7.406166219839142, "grad_norm": 0.20110473036766052, "learning_rate": 9.567118141939763e-06, "loss": 0.786, "num_input_tokens_seen": 28679712, "step": 49725 }, { "epoch": 7.406910932380101, "grad_norm": 0.1722785085439682, "learning_rate": 9.562005920622009e-06, "loss": 0.7971, "num_input_tokens_seen": 28682816, "step": 49730 }, { "epoch": 7.407655644921061, "grad_norm": 0.26374638080596924, "learning_rate": 9.556894742541117e-06, "loss": 0.7867, "num_input_tokens_seen": 28685792, "step": 49735 }, { "epoch": 7.408400357462019, "grad_norm": 0.21376585960388184, "learning_rate": 9.551784608042501e-06, "loss": 0.8177, "num_input_tokens_seen": 28688704, "step": 49740 }, { "epoch": 7.409145070002979, "grad_norm": 0.21170422434806824, "learning_rate": 9.546675517471465e-06, "loss": 0.7665, "num_input_tokens_seen": 28691296, "step": 49745 }, { "epoch": 7.409889782543938, "grad_norm": 0.2527347803115845, "learning_rate": 9.541567471173268e-06, "loss": 0.768, "num_input_tokens_seen": 28694304, "step": 49750 }, { "epoch": 7.4106344950848975, "grad_norm": 0.21231712400913239, "learning_rate": 9.536460469493095e-06, "loss": 0.8061, "num_input_tokens_seen": 28697088, "step": 49755 }, { "epoch": 7.411379207625856, "grad_norm": 0.2392677366733551, "learning_rate": 9.53135451277605e-06, "loss": 0.7825, "num_input_tokens_seen": 28700032, "step": 49760 }, { "epoch": 7.412123920166816, "grad_norm": 0.19292111694812775, "learning_rate": 9.526249601367185e-06, "loss": 0.7949, "num_input_tokens_seen": 28702688, "step": 49765 }, { "epoch": 7.412868632707775, "grad_norm": 0.18859027326107025, "learning_rate": 9.521145735611453e-06, "loss": 0.8232, "num_input_tokens_seen": 28705536, "step": 49770 }, { "epoch": 7.413613345248734, "grad_norm": 0.26755860447883606, "learning_rate": 9.51604291585376e-06, "loss": 0.7874, "num_input_tokens_seen": 28708544, "step": 49775 }, { "epoch": 7.414358057789693, "grad_norm": 0.2697336971759796, "learning_rate": 9.510941142438939e-06, "loss": 0.7838, "num_input_tokens_seen": 28711424, "step": 49780 }, { "epoch": 7.415102770330653, "grad_norm": 0.2558048367500305, "learning_rate": 9.505840415711737e-06, "loss": 0.7796, "num_input_tokens_seen": 28714400, "step": 49785 }, { "epoch": 7.415847482871611, "grad_norm": 0.20221155881881714, "learning_rate": 9.500740736016845e-06, "loss": 0.7968, "num_input_tokens_seen": 28717376, "step": 49790 }, { "epoch": 7.416592195412571, "grad_norm": 0.28163912892341614, "learning_rate": 9.495642103698877e-06, "loss": 0.8202, "num_input_tokens_seen": 28720224, "step": 49795 }, { "epoch": 7.41733690795353, "grad_norm": 0.1800881177186966, "learning_rate": 9.490544519102387e-06, "loss": 0.8049, "num_input_tokens_seen": 28723040, "step": 49800 }, { "epoch": 7.4180816204944895, "grad_norm": 0.21105830371379852, "learning_rate": 9.485447982571832e-06, "loss": 0.8014, "num_input_tokens_seen": 28725760, "step": 49805 }, { "epoch": 7.418826333035448, "grad_norm": 0.20605647563934326, "learning_rate": 9.480352494451628e-06, "loss": 0.8079, "num_input_tokens_seen": 28728448, "step": 49810 }, { "epoch": 7.419571045576408, "grad_norm": 0.23981153964996338, "learning_rate": 9.475258055086102e-06, "loss": 0.834, "num_input_tokens_seen": 28731264, "step": 49815 }, { "epoch": 7.420315758117367, "grad_norm": 0.3338756859302521, "learning_rate": 9.470164664819527e-06, "loss": 0.8072, "num_input_tokens_seen": 28734048, "step": 49820 }, { "epoch": 7.421060470658326, "grad_norm": 0.25887852907180786, "learning_rate": 9.465072323996078e-06, "loss": 0.8223, "num_input_tokens_seen": 28737184, "step": 49825 }, { "epoch": 7.421805183199285, "grad_norm": 0.1927550733089447, "learning_rate": 9.459981032959877e-06, "loss": 0.7995, "num_input_tokens_seen": 28740128, "step": 49830 }, { "epoch": 7.422549895740245, "grad_norm": 0.19837909936904907, "learning_rate": 9.454890792054987e-06, "loss": 0.8121, "num_input_tokens_seen": 28742784, "step": 49835 }, { "epoch": 7.4232946082812035, "grad_norm": 0.206697016954422, "learning_rate": 9.44980160162537e-06, "loss": 0.8136, "num_input_tokens_seen": 28745792, "step": 49840 }, { "epoch": 7.424039320822162, "grad_norm": 0.1576482653617859, "learning_rate": 9.444713462014934e-06, "loss": 0.8011, "num_input_tokens_seen": 28748704, "step": 49845 }, { "epoch": 7.424784033363122, "grad_norm": 0.22961682081222534, "learning_rate": 9.43962637356752e-06, "loss": 0.8261, "num_input_tokens_seen": 28751552, "step": 49850 }, { "epoch": 7.425528745904081, "grad_norm": 0.1742979735136032, "learning_rate": 9.434540336626892e-06, "loss": 0.7935, "num_input_tokens_seen": 28754304, "step": 49855 }, { "epoch": 7.42627345844504, "grad_norm": 0.22454418241977692, "learning_rate": 9.429455351536754e-06, "loss": 0.8042, "num_input_tokens_seen": 28757248, "step": 49860 }, { "epoch": 7.427018170985999, "grad_norm": 0.2485467493534088, "learning_rate": 9.424371418640706e-06, "loss": 0.8002, "num_input_tokens_seen": 28760128, "step": 49865 }, { "epoch": 7.427762883526959, "grad_norm": 0.24619987607002258, "learning_rate": 9.419288538282323e-06, "loss": 0.7858, "num_input_tokens_seen": 28763264, "step": 49870 }, { "epoch": 7.428507596067917, "grad_norm": 0.2653796970844269, "learning_rate": 9.414206710805062e-06, "loss": 0.7984, "num_input_tokens_seen": 28765984, "step": 49875 }, { "epoch": 7.429252308608877, "grad_norm": 0.22849902510643005, "learning_rate": 9.409125936552349e-06, "loss": 0.8042, "num_input_tokens_seen": 28768576, "step": 49880 }, { "epoch": 7.429997021149836, "grad_norm": 0.23789562284946442, "learning_rate": 9.404046215867515e-06, "loss": 0.7924, "num_input_tokens_seen": 28771488, "step": 49885 }, { "epoch": 7.4307417336907955, "grad_norm": 0.26771873235702515, "learning_rate": 9.398967549093828e-06, "loss": 0.8189, "num_input_tokens_seen": 28774432, "step": 49890 }, { "epoch": 7.431486446231754, "grad_norm": 0.18643580377101898, "learning_rate": 9.393889936574496e-06, "loss": 0.8099, "num_input_tokens_seen": 28777184, "step": 49895 }, { "epoch": 7.432231158772714, "grad_norm": 0.20777934789657593, "learning_rate": 9.388813378652623e-06, "loss": 0.806, "num_input_tokens_seen": 28780032, "step": 49900 }, { "epoch": 7.432975871313673, "grad_norm": 0.17197448015213013, "learning_rate": 9.383737875671278e-06, "loss": 0.7942, "num_input_tokens_seen": 28782848, "step": 49905 }, { "epoch": 7.433720583854632, "grad_norm": 0.18926961719989777, "learning_rate": 9.378663427973428e-06, "loss": 0.8223, "num_input_tokens_seen": 28785760, "step": 49910 }, { "epoch": 7.434465296395591, "grad_norm": 0.269798219203949, "learning_rate": 9.373590035901993e-06, "loss": 0.7995, "num_input_tokens_seen": 28788896, "step": 49915 }, { "epoch": 7.435210008936551, "grad_norm": 0.20552900433540344, "learning_rate": 9.368517699799812e-06, "loss": 0.7852, "num_input_tokens_seen": 28791584, "step": 49920 }, { "epoch": 7.4359547214775095, "grad_norm": 0.12511955201625824, "learning_rate": 9.36344642000965e-06, "loss": 0.7964, "num_input_tokens_seen": 28794400, "step": 49925 }, { "epoch": 7.436699434018469, "grad_norm": 0.20202623307704926, "learning_rate": 9.358376196874214e-06, "loss": 0.8067, "num_input_tokens_seen": 28797536, "step": 49930 }, { "epoch": 7.437444146559428, "grad_norm": 0.424360454082489, "learning_rate": 9.353307030736113e-06, "loss": 0.7801, "num_input_tokens_seen": 28800960, "step": 49935 }, { "epoch": 7.4381888591003875, "grad_norm": 0.21358685195446014, "learning_rate": 9.348238921937916e-06, "loss": 0.778, "num_input_tokens_seen": 28804000, "step": 49940 }, { "epoch": 7.438933571641346, "grad_norm": 0.26809483766555786, "learning_rate": 9.34317187082209e-06, "loss": 0.7936, "num_input_tokens_seen": 28806784, "step": 49945 }, { "epoch": 7.439678284182306, "grad_norm": 0.18351662158966064, "learning_rate": 9.338105877731051e-06, "loss": 0.8104, "num_input_tokens_seen": 28809728, "step": 49950 }, { "epoch": 7.440422996723265, "grad_norm": 0.29614272713661194, "learning_rate": 9.33304094300714e-06, "loss": 0.8253, "num_input_tokens_seen": 28812608, "step": 49955 }, { "epoch": 7.441167709264224, "grad_norm": 0.28417396545410156, "learning_rate": 9.327977066992627e-06, "loss": 0.7735, "num_input_tokens_seen": 28815648, "step": 49960 }, { "epoch": 7.441912421805183, "grad_norm": 0.22555120289325714, "learning_rate": 9.322914250029713e-06, "loss": 0.7905, "num_input_tokens_seen": 28818336, "step": 49965 }, { "epoch": 7.442657134346143, "grad_norm": 0.19303126633167267, "learning_rate": 9.317852492460508e-06, "loss": 0.8051, "num_input_tokens_seen": 28821056, "step": 49970 }, { "epoch": 7.4434018468871015, "grad_norm": 0.2524538040161133, "learning_rate": 9.312791794627072e-06, "loss": 0.8072, "num_input_tokens_seen": 28824032, "step": 49975 }, { "epoch": 7.444146559428061, "grad_norm": 0.20765675604343414, "learning_rate": 9.307732156871393e-06, "loss": 0.7855, "num_input_tokens_seen": 28827072, "step": 49980 }, { "epoch": 7.44489127196902, "grad_norm": 0.2630591094493866, "learning_rate": 9.30267357953537e-06, "loss": 0.8053, "num_input_tokens_seen": 28830048, "step": 49985 }, { "epoch": 7.4456359845099795, "grad_norm": 0.20690996944904327, "learning_rate": 9.297616062960843e-06, "loss": 0.7906, "num_input_tokens_seen": 28832928, "step": 49990 }, { "epoch": 7.446380697050938, "grad_norm": 0.14809690415859222, "learning_rate": 9.292559607489585e-06, "loss": 0.8335, "num_input_tokens_seen": 28835808, "step": 49995 }, { "epoch": 7.447125409591898, "grad_norm": 0.18135857582092285, "learning_rate": 9.287504213463292e-06, "loss": 0.8022, "num_input_tokens_seen": 28838656, "step": 50000 }, { "epoch": 7.447870122132857, "grad_norm": 0.22962015867233276, "learning_rate": 9.282449881223573e-06, "loss": 0.7926, "num_input_tokens_seen": 28841536, "step": 50005 }, { "epoch": 7.4486148346738155, "grad_norm": 0.2996968626976013, "learning_rate": 9.27739661111199e-06, "loss": 0.8273, "num_input_tokens_seen": 28844416, "step": 50010 }, { "epoch": 7.449359547214775, "grad_norm": 0.261713445186615, "learning_rate": 9.27234440347002e-06, "loss": 0.8085, "num_input_tokens_seen": 28847328, "step": 50015 }, { "epoch": 7.450104259755734, "grad_norm": 0.19878464937210083, "learning_rate": 9.267293258639082e-06, "loss": 0.8064, "num_input_tokens_seen": 28850528, "step": 50020 }, { "epoch": 7.4508489722966935, "grad_norm": 0.2564563453197479, "learning_rate": 9.262243176960489e-06, "loss": 0.7994, "num_input_tokens_seen": 28853376, "step": 50025 }, { "epoch": 7.451593684837652, "grad_norm": 0.1876109093427658, "learning_rate": 9.257194158775517e-06, "loss": 0.7935, "num_input_tokens_seen": 28856384, "step": 50030 }, { "epoch": 7.452338397378612, "grad_norm": 0.23457908630371094, "learning_rate": 9.252146204425369e-06, "loss": 0.7898, "num_input_tokens_seen": 28859360, "step": 50035 }, { "epoch": 7.453083109919571, "grad_norm": 0.20441146194934845, "learning_rate": 9.247099314251145e-06, "loss": 0.7923, "num_input_tokens_seen": 28862016, "step": 50040 }, { "epoch": 7.45382782246053, "grad_norm": 0.22140495479106903, "learning_rate": 9.242053488593902e-06, "loss": 0.8226, "num_input_tokens_seen": 28864832, "step": 50045 }, { "epoch": 7.454572535001489, "grad_norm": 0.1574752926826477, "learning_rate": 9.237008727794618e-06, "loss": 0.7979, "num_input_tokens_seen": 28867424, "step": 50050 }, { "epoch": 7.455317247542449, "grad_norm": 0.18327029049396515, "learning_rate": 9.231965032194198e-06, "loss": 0.8103, "num_input_tokens_seen": 28870464, "step": 50055 }, { "epoch": 7.4560619600834075, "grad_norm": 0.1787380874156952, "learning_rate": 9.226922402133477e-06, "loss": 0.7921, "num_input_tokens_seen": 28873216, "step": 50060 }, { "epoch": 7.456806672624367, "grad_norm": 0.2071818709373474, "learning_rate": 9.221880837953209e-06, "loss": 0.8138, "num_input_tokens_seen": 28875968, "step": 50065 }, { "epoch": 7.457551385165326, "grad_norm": 0.21090462803840637, "learning_rate": 9.21684033999409e-06, "loss": 0.8073, "num_input_tokens_seen": 28878880, "step": 50070 }, { "epoch": 7.4582960977062855, "grad_norm": 0.2474193125963211, "learning_rate": 9.21180090859672e-06, "loss": 0.803, "num_input_tokens_seen": 28881568, "step": 50075 }, { "epoch": 7.459040810247244, "grad_norm": 0.21176229417324066, "learning_rate": 9.20676254410166e-06, "loss": 0.8074, "num_input_tokens_seen": 28884448, "step": 50080 }, { "epoch": 7.459785522788204, "grad_norm": 0.20234757661819458, "learning_rate": 9.201725246849374e-06, "loss": 0.8057, "num_input_tokens_seen": 28887136, "step": 50085 }, { "epoch": 7.460530235329163, "grad_norm": 0.1864691972732544, "learning_rate": 9.196689017180262e-06, "loss": 0.7848, "num_input_tokens_seen": 28889888, "step": 50090 }, { "epoch": 7.461274947870122, "grad_norm": 0.25760677456855774, "learning_rate": 9.191653855434667e-06, "loss": 0.805, "num_input_tokens_seen": 28892736, "step": 50095 }, { "epoch": 7.462019660411081, "grad_norm": 0.1829281747341156, "learning_rate": 9.186619761952831e-06, "loss": 0.7981, "num_input_tokens_seen": 28896000, "step": 50100 }, { "epoch": 7.462764372952041, "grad_norm": 0.20052470266819, "learning_rate": 9.181586737074932e-06, "loss": 0.8177, "num_input_tokens_seen": 28898880, "step": 50105 }, { "epoch": 7.4635090854929995, "grad_norm": 0.19732607901096344, "learning_rate": 9.176554781141086e-06, "loss": 0.8003, "num_input_tokens_seen": 28901632, "step": 50110 }, { "epoch": 7.464253798033959, "grad_norm": 0.2136654406785965, "learning_rate": 9.171523894491336e-06, "loss": 0.7906, "num_input_tokens_seen": 28904608, "step": 50115 }, { "epoch": 7.464998510574918, "grad_norm": 0.23990853130817413, "learning_rate": 9.166494077465645e-06, "loss": 0.8136, "num_input_tokens_seen": 28907456, "step": 50120 }, { "epoch": 7.465743223115878, "grad_norm": 0.21901674568653107, "learning_rate": 9.161465330403912e-06, "loss": 0.7857, "num_input_tokens_seen": 28910400, "step": 50125 }, { "epoch": 7.466487935656836, "grad_norm": 0.24505116045475006, "learning_rate": 9.156437653645966e-06, "loss": 0.8123, "num_input_tokens_seen": 28913280, "step": 50130 }, { "epoch": 7.467232648197796, "grad_norm": 0.19383950531482697, "learning_rate": 9.151411047531539e-06, "loss": 0.7862, "num_input_tokens_seen": 28916320, "step": 50135 }, { "epoch": 7.467977360738755, "grad_norm": 0.32560428977012634, "learning_rate": 9.146385512400323e-06, "loss": 0.8105, "num_input_tokens_seen": 28919520, "step": 50140 }, { "epoch": 7.468722073279714, "grad_norm": 0.2186041623353958, "learning_rate": 9.141361048591916e-06, "loss": 0.7816, "num_input_tokens_seen": 28922624, "step": 50145 }, { "epoch": 7.469466785820673, "grad_norm": 0.25556591153144836, "learning_rate": 9.136337656445849e-06, "loss": 0.7929, "num_input_tokens_seen": 28925280, "step": 50150 }, { "epoch": 7.470211498361633, "grad_norm": 0.2067762166261673, "learning_rate": 9.131315336301585e-06, "loss": 0.8097, "num_input_tokens_seen": 28928320, "step": 50155 }, { "epoch": 7.4709562109025915, "grad_norm": 0.22700071334838867, "learning_rate": 9.126294088498515e-06, "loss": 0.7831, "num_input_tokens_seen": 28931136, "step": 50160 }, { "epoch": 7.471700923443551, "grad_norm": 0.19268976151943207, "learning_rate": 9.12127391337596e-06, "loss": 0.7942, "num_input_tokens_seen": 28934240, "step": 50165 }, { "epoch": 7.47244563598451, "grad_norm": 0.24157088994979858, "learning_rate": 9.116254811273151e-06, "loss": 0.8094, "num_input_tokens_seen": 28937024, "step": 50170 }, { "epoch": 7.473190348525469, "grad_norm": 0.19200333952903748, "learning_rate": 9.111236782529259e-06, "loss": 0.816, "num_input_tokens_seen": 28940064, "step": 50175 }, { "epoch": 7.473935061066428, "grad_norm": 0.27168703079223633, "learning_rate": 9.106219827483398e-06, "loss": 0.7984, "num_input_tokens_seen": 28942976, "step": 50180 }, { "epoch": 7.474679773607388, "grad_norm": 0.178129181265831, "learning_rate": 9.101203946474571e-06, "loss": 0.8072, "num_input_tokens_seen": 28945632, "step": 50185 }, { "epoch": 7.475424486148347, "grad_norm": 0.23496538400650024, "learning_rate": 9.09618913984174e-06, "loss": 0.8238, "num_input_tokens_seen": 28948256, "step": 50190 }, { "epoch": 7.4761691986893055, "grad_norm": 0.2219366580247879, "learning_rate": 9.091175407923788e-06, "loss": 0.8282, "num_input_tokens_seen": 28951168, "step": 50195 }, { "epoch": 7.476913911230265, "grad_norm": 0.206252321600914, "learning_rate": 9.086162751059532e-06, "loss": 0.8104, "num_input_tokens_seen": 28953920, "step": 50200 }, { "epoch": 7.477658623771224, "grad_norm": 0.26720181107521057, "learning_rate": 9.081151169587686e-06, "loss": 0.8246, "num_input_tokens_seen": 28956608, "step": 50205 }, { "epoch": 7.478403336312184, "grad_norm": 0.20212167501449585, "learning_rate": 9.076140663846925e-06, "loss": 0.7794, "num_input_tokens_seen": 28959840, "step": 50210 }, { "epoch": 7.479148048853142, "grad_norm": 0.2555553615093231, "learning_rate": 9.071131234175831e-06, "loss": 0.7824, "num_input_tokens_seen": 28962624, "step": 50215 }, { "epoch": 7.479892761394102, "grad_norm": 0.187027245759964, "learning_rate": 9.066122880912938e-06, "loss": 0.7808, "num_input_tokens_seen": 28965056, "step": 50220 }, { "epoch": 7.480637473935061, "grad_norm": 0.1825553923845291, "learning_rate": 9.06111560439667e-06, "loss": 0.7832, "num_input_tokens_seen": 28967936, "step": 50225 }, { "epoch": 7.48138218647602, "grad_norm": 0.24949000775814056, "learning_rate": 9.056109404965408e-06, "loss": 0.7883, "num_input_tokens_seen": 28970816, "step": 50230 }, { "epoch": 7.482126899016979, "grad_norm": 0.23480886220932007, "learning_rate": 9.051104282957454e-06, "loss": 0.7844, "num_input_tokens_seen": 28973568, "step": 50235 }, { "epoch": 7.482871611557939, "grad_norm": 0.1941388100385666, "learning_rate": 9.046100238711021e-06, "loss": 0.7863, "num_input_tokens_seen": 28976544, "step": 50240 }, { "epoch": 7.4836163240988975, "grad_norm": 0.22090938687324524, "learning_rate": 9.041097272564275e-06, "loss": 0.7945, "num_input_tokens_seen": 28979616, "step": 50245 }, { "epoch": 7.484361036639857, "grad_norm": 0.23592627048492432, "learning_rate": 9.036095384855287e-06, "loss": 0.7945, "num_input_tokens_seen": 28982368, "step": 50250 }, { "epoch": 7.485105749180816, "grad_norm": 0.2746468484401703, "learning_rate": 9.03109457592207e-06, "loss": 0.7717, "num_input_tokens_seen": 28985216, "step": 50255 }, { "epoch": 7.485850461721776, "grad_norm": 0.32134631276130676, "learning_rate": 9.026094846102565e-06, "loss": 0.8321, "num_input_tokens_seen": 28988000, "step": 50260 }, { "epoch": 7.486595174262734, "grad_norm": 0.22913450002670288, "learning_rate": 9.021096195734625e-06, "loss": 0.7694, "num_input_tokens_seen": 28990752, "step": 50265 }, { "epoch": 7.487339886803694, "grad_norm": 0.2323499172925949, "learning_rate": 9.016098625156027e-06, "loss": 0.8092, "num_input_tokens_seen": 28994080, "step": 50270 }, { "epoch": 7.488084599344653, "grad_norm": 0.25627607107162476, "learning_rate": 9.011102134704501e-06, "loss": 0.7981, "num_input_tokens_seen": 28997088, "step": 50275 }, { "epoch": 7.488829311885612, "grad_norm": 0.27793675661087036, "learning_rate": 9.006106724717686e-06, "loss": 0.7969, "num_input_tokens_seen": 29000128, "step": 50280 }, { "epoch": 7.489574024426571, "grad_norm": 0.24817249178886414, "learning_rate": 9.001112395533153e-06, "loss": 0.7941, "num_input_tokens_seen": 29003168, "step": 50285 }, { "epoch": 7.490318736967531, "grad_norm": 0.20045670866966248, "learning_rate": 8.996119147488396e-06, "loss": 0.752, "num_input_tokens_seen": 29006240, "step": 50290 }, { "epoch": 7.49106344950849, "grad_norm": 0.24169297516345978, "learning_rate": 8.99112698092085e-06, "loss": 0.7843, "num_input_tokens_seen": 29009344, "step": 50295 }, { "epoch": 7.491808162049449, "grad_norm": 0.21159864962100983, "learning_rate": 8.986135896167856e-06, "loss": 0.8073, "num_input_tokens_seen": 29012224, "step": 50300 }, { "epoch": 7.492552874590408, "grad_norm": 0.3383975028991699, "learning_rate": 8.98114589356668e-06, "loss": 0.8159, "num_input_tokens_seen": 29015200, "step": 50305 }, { "epoch": 7.493297587131368, "grad_norm": 0.18024489283561707, "learning_rate": 8.97615697345454e-06, "loss": 0.7746, "num_input_tokens_seen": 29018048, "step": 50310 }, { "epoch": 7.494042299672326, "grad_norm": 0.20067793130874634, "learning_rate": 8.97116913616856e-06, "loss": 0.7973, "num_input_tokens_seen": 29020736, "step": 50315 }, { "epoch": 7.494787012213286, "grad_norm": 0.197018101811409, "learning_rate": 8.966182382045801e-06, "loss": 0.7853, "num_input_tokens_seen": 29023584, "step": 50320 }, { "epoch": 7.495531724754245, "grad_norm": 0.2085530310869217, "learning_rate": 8.96119671142325e-06, "loss": 0.8069, "num_input_tokens_seen": 29026336, "step": 50325 }, { "epoch": 7.496276437295204, "grad_norm": 0.2737725079059601, "learning_rate": 8.956212124637822e-06, "loss": 0.8434, "num_input_tokens_seen": 29029280, "step": 50330 }, { "epoch": 7.497021149836163, "grad_norm": 0.29799485206604004, "learning_rate": 8.951228622026344e-06, "loss": 0.826, "num_input_tokens_seen": 29032416, "step": 50335 }, { "epoch": 7.497765862377123, "grad_norm": 0.21922826766967773, "learning_rate": 8.946246203925584e-06, "loss": 0.8012, "num_input_tokens_seen": 29035552, "step": 50340 }, { "epoch": 7.498510574918082, "grad_norm": 0.21127763390541077, "learning_rate": 8.941264870672244e-06, "loss": 0.8064, "num_input_tokens_seen": 29038400, "step": 50345 }, { "epoch": 7.499255287459041, "grad_norm": 0.22947244346141815, "learning_rate": 8.936284622602927e-06, "loss": 0.8079, "num_input_tokens_seen": 29041216, "step": 50350 }, { "epoch": 7.5, "grad_norm": 0.2440834641456604, "learning_rate": 8.931305460054184e-06, "loss": 0.7807, "num_input_tokens_seen": 29044256, "step": 50355 }, { "epoch": 7.5, "eval_loss": 0.8030418157577515, "eval_runtime": 45.3189, "eval_samples_per_second": 65.845, "eval_steps_per_second": 16.461, "num_input_tokens_seen": 29044256, "step": 50355 }, { "epoch": 7.500744712540959, "grad_norm": 0.32720524072647095, "learning_rate": 8.92632738336249e-06, "loss": 0.7875, "num_input_tokens_seen": 29047264, "step": 50360 }, { "epoch": 7.501489425081918, "grad_norm": 0.25551173090934753, "learning_rate": 8.921350392864247e-06, "loss": 0.7941, "num_input_tokens_seen": 29050112, "step": 50365 }, { "epoch": 7.502234137622878, "grad_norm": 0.21119652688503265, "learning_rate": 8.916374488895766e-06, "loss": 0.7996, "num_input_tokens_seen": 29052896, "step": 50370 }, { "epoch": 7.502978850163837, "grad_norm": 0.21520748734474182, "learning_rate": 8.911399671793302e-06, "loss": 0.8282, "num_input_tokens_seen": 29055840, "step": 50375 }, { "epoch": 7.503723562704796, "grad_norm": 0.28982093930244446, "learning_rate": 8.90642594189304e-06, "loss": 0.8035, "num_input_tokens_seen": 29058528, "step": 50380 }, { "epoch": 7.504468275245755, "grad_norm": 0.256539911031723, "learning_rate": 8.90145329953109e-06, "loss": 0.7895, "num_input_tokens_seen": 29061568, "step": 50385 }, { "epoch": 7.505212987786714, "grad_norm": 0.3129029870033264, "learning_rate": 8.896481745043463e-06, "loss": 0.8477, "num_input_tokens_seen": 29064448, "step": 50390 }, { "epoch": 7.505957700327674, "grad_norm": 0.17364796996116638, "learning_rate": 8.891511278766132e-06, "loss": 0.8061, "num_input_tokens_seen": 29067136, "step": 50395 }, { "epoch": 7.506702412868632, "grad_norm": 0.1992996633052826, "learning_rate": 8.886541901034981e-06, "loss": 0.8339, "num_input_tokens_seen": 29069952, "step": 50400 }, { "epoch": 7.507447125409592, "grad_norm": 0.23249967396259308, "learning_rate": 8.88157361218581e-06, "loss": 0.8184, "num_input_tokens_seen": 29072800, "step": 50405 }, { "epoch": 7.508191837950551, "grad_norm": 0.22635957598686218, "learning_rate": 8.876606412554358e-06, "loss": 0.7924, "num_input_tokens_seen": 29075552, "step": 50410 }, { "epoch": 7.50893655049151, "grad_norm": 0.32139861583709717, "learning_rate": 8.871640302476295e-06, "loss": 0.8417, "num_input_tokens_seen": 29078528, "step": 50415 }, { "epoch": 7.509681263032469, "grad_norm": 0.17551586031913757, "learning_rate": 8.866675282287204e-06, "loss": 0.7994, "num_input_tokens_seen": 29081536, "step": 50420 }, { "epoch": 7.510425975573429, "grad_norm": 0.17927996814250946, "learning_rate": 8.861711352322616e-06, "loss": 0.8003, "num_input_tokens_seen": 29084704, "step": 50425 }, { "epoch": 7.511170688114388, "grad_norm": 0.24993085861206055, "learning_rate": 8.85674851291796e-06, "loss": 0.8006, "num_input_tokens_seen": 29087296, "step": 50430 }, { "epoch": 7.511915400655347, "grad_norm": 0.3158489167690277, "learning_rate": 8.851786764408596e-06, "loss": 0.7969, "num_input_tokens_seen": 29090592, "step": 50435 }, { "epoch": 7.512660113196306, "grad_norm": 0.2368731051683426, "learning_rate": 8.84682610712983e-06, "loss": 0.8496, "num_input_tokens_seen": 29093344, "step": 50440 }, { "epoch": 7.513404825737266, "grad_norm": 0.2628810405731201, "learning_rate": 8.841866541416883e-06, "loss": 0.7904, "num_input_tokens_seen": 29095936, "step": 50445 }, { "epoch": 7.514149538278224, "grad_norm": 0.1984676867723465, "learning_rate": 8.836908067604898e-06, "loss": 0.7979, "num_input_tokens_seen": 29098624, "step": 50450 }, { "epoch": 7.514894250819184, "grad_norm": 0.26661360263824463, "learning_rate": 8.831950686028953e-06, "loss": 0.815, "num_input_tokens_seen": 29101632, "step": 50455 }, { "epoch": 7.515638963360143, "grad_norm": 0.22010010480880737, "learning_rate": 8.826994397024055e-06, "loss": 0.7684, "num_input_tokens_seen": 29104704, "step": 50460 }, { "epoch": 7.5163836759011025, "grad_norm": 0.22162184119224548, "learning_rate": 8.82203920092512e-06, "loss": 0.8074, "num_input_tokens_seen": 29107680, "step": 50465 }, { "epoch": 7.517128388442061, "grad_norm": 0.27975964546203613, "learning_rate": 8.817085098066994e-06, "loss": 0.8005, "num_input_tokens_seen": 29110688, "step": 50470 }, { "epoch": 7.517873100983021, "grad_norm": 0.20112305879592896, "learning_rate": 8.812132088784458e-06, "loss": 0.7958, "num_input_tokens_seen": 29113408, "step": 50475 }, { "epoch": 7.51861781352398, "grad_norm": 0.22113239765167236, "learning_rate": 8.807180173412225e-06, "loss": 0.8073, "num_input_tokens_seen": 29116192, "step": 50480 }, { "epoch": 7.519362526064939, "grad_norm": 0.16928556561470032, "learning_rate": 8.802229352284919e-06, "loss": 0.7935, "num_input_tokens_seen": 29119008, "step": 50485 }, { "epoch": 7.520107238605898, "grad_norm": 0.20679841935634613, "learning_rate": 8.797279625737098e-06, "loss": 0.8096, "num_input_tokens_seen": 29121952, "step": 50490 }, { "epoch": 7.520851951146858, "grad_norm": 0.2119065374135971, "learning_rate": 8.792330994103253e-06, "loss": 0.8122, "num_input_tokens_seen": 29124704, "step": 50495 }, { "epoch": 7.521596663687816, "grad_norm": 0.2976716160774231, "learning_rate": 8.787383457717777e-06, "loss": 0.7865, "num_input_tokens_seen": 29128064, "step": 50500 }, { "epoch": 7.522341376228776, "grad_norm": 0.202882319688797, "learning_rate": 8.782437016915016e-06, "loss": 0.7851, "num_input_tokens_seen": 29130944, "step": 50505 }, { "epoch": 7.523086088769735, "grad_norm": 0.19209295511245728, "learning_rate": 8.77749167202922e-06, "loss": 0.7852, "num_input_tokens_seen": 29133984, "step": 50510 }, { "epoch": 7.5238308013106945, "grad_norm": 0.34306612610816956, "learning_rate": 8.77254742339458e-06, "loss": 0.8164, "num_input_tokens_seen": 29136672, "step": 50515 }, { "epoch": 7.524575513851653, "grad_norm": 0.20051108300685883, "learning_rate": 8.767604271345209e-06, "loss": 0.8021, "num_input_tokens_seen": 29139616, "step": 50520 }, { "epoch": 7.525320226392612, "grad_norm": 0.20569443702697754, "learning_rate": 8.762662216215146e-06, "loss": 0.8017, "num_input_tokens_seen": 29142304, "step": 50525 }, { "epoch": 7.526064938933572, "grad_norm": 0.23036561906337738, "learning_rate": 8.75772125833836e-06, "loss": 0.7926, "num_input_tokens_seen": 29145184, "step": 50530 }, { "epoch": 7.526809651474531, "grad_norm": 0.26949819922447205, "learning_rate": 8.752781398048732e-06, "loss": 0.7877, "num_input_tokens_seen": 29148160, "step": 50535 }, { "epoch": 7.52755436401549, "grad_norm": 0.22584941983222961, "learning_rate": 8.747842635680076e-06, "loss": 0.8056, "num_input_tokens_seen": 29150848, "step": 50540 }, { "epoch": 7.528299076556449, "grad_norm": 0.15875378251075745, "learning_rate": 8.742904971566148e-06, "loss": 0.8169, "num_input_tokens_seen": 29153696, "step": 50545 }, { "epoch": 7.5290437890974085, "grad_norm": 0.16347110271453857, "learning_rate": 8.737968406040597e-06, "loss": 0.7895, "num_input_tokens_seen": 29156544, "step": 50550 }, { "epoch": 7.529788501638367, "grad_norm": 0.24701473116874695, "learning_rate": 8.733032939437025e-06, "loss": 0.8136, "num_input_tokens_seen": 29159552, "step": 50555 }, { "epoch": 7.530533214179327, "grad_norm": 0.22589856386184692, "learning_rate": 8.72809857208895e-06, "loss": 0.799, "num_input_tokens_seen": 29162592, "step": 50560 }, { "epoch": 7.531277926720286, "grad_norm": 0.18419288098812103, "learning_rate": 8.723165304329825e-06, "loss": 0.813, "num_input_tokens_seen": 29165600, "step": 50565 }, { "epoch": 7.532022639261245, "grad_norm": 0.23294395208358765, "learning_rate": 8.718233136493004e-06, "loss": 0.8019, "num_input_tokens_seen": 29168544, "step": 50570 }, { "epoch": 7.532767351802204, "grad_norm": 0.1905583292245865, "learning_rate": 8.71330206891179e-06, "loss": 0.8029, "num_input_tokens_seen": 29171328, "step": 50575 }, { "epoch": 7.533512064343164, "grad_norm": 0.2097007781267166, "learning_rate": 8.708372101919407e-06, "loss": 0.8027, "num_input_tokens_seen": 29174336, "step": 50580 }, { "epoch": 7.534256776884122, "grad_norm": 0.21964909136295319, "learning_rate": 8.703443235849007e-06, "loss": 0.7905, "num_input_tokens_seen": 29176992, "step": 50585 }, { "epoch": 7.535001489425082, "grad_norm": 0.232695534825325, "learning_rate": 8.698515471033649e-06, "loss": 0.8278, "num_input_tokens_seen": 29179520, "step": 50590 }, { "epoch": 7.535746201966041, "grad_norm": 0.16670957207679749, "learning_rate": 8.693588807806346e-06, "loss": 0.7725, "num_input_tokens_seen": 29182112, "step": 50595 }, { "epoch": 7.5364909145070005, "grad_norm": 0.2536585330963135, "learning_rate": 8.688663246500005e-06, "loss": 0.8197, "num_input_tokens_seen": 29185216, "step": 50600 }, { "epoch": 7.537235627047959, "grad_norm": 0.1634407639503479, "learning_rate": 8.683738787447488e-06, "loss": 0.8002, "num_input_tokens_seen": 29188096, "step": 50605 }, { "epoch": 7.537980339588919, "grad_norm": 0.2068939208984375, "learning_rate": 8.678815430981563e-06, "loss": 0.7846, "num_input_tokens_seen": 29190976, "step": 50610 }, { "epoch": 7.538725052129878, "grad_norm": 0.20556944608688354, "learning_rate": 8.673893177434936e-06, "loss": 0.8003, "num_input_tokens_seen": 29193632, "step": 50615 }, { "epoch": 7.539469764670837, "grad_norm": 0.22762925922870636, "learning_rate": 8.668972027140231e-06, "loss": 0.8193, "num_input_tokens_seen": 29196576, "step": 50620 }, { "epoch": 7.540214477211796, "grad_norm": 0.1799624264240265, "learning_rate": 8.664051980430008e-06, "loss": 0.7782, "num_input_tokens_seen": 29199456, "step": 50625 }, { "epoch": 7.540959189752756, "grad_norm": 0.1938038468360901, "learning_rate": 8.659133037636732e-06, "loss": 0.8167, "num_input_tokens_seen": 29202272, "step": 50630 }, { "epoch": 7.5417039022937145, "grad_norm": 0.32869523763656616, "learning_rate": 8.654215199092804e-06, "loss": 0.8153, "num_input_tokens_seen": 29205440, "step": 50635 }, { "epoch": 7.542448614834674, "grad_norm": 0.29769521951675415, "learning_rate": 8.649298465130553e-06, "loss": 0.7893, "num_input_tokens_seen": 29208224, "step": 50640 }, { "epoch": 7.543193327375633, "grad_norm": 0.163558229804039, "learning_rate": 8.644382836082235e-06, "loss": 0.7975, "num_input_tokens_seen": 29211168, "step": 50645 }, { "epoch": 7.5439380399165925, "grad_norm": 0.22189606726169586, "learning_rate": 8.63946831228003e-06, "loss": 0.7729, "num_input_tokens_seen": 29214176, "step": 50650 }, { "epoch": 7.544682752457551, "grad_norm": 0.24304848909378052, "learning_rate": 8.634554894056038e-06, "loss": 0.804, "num_input_tokens_seen": 29216832, "step": 50655 }, { "epoch": 7.545427464998511, "grad_norm": 0.2662397027015686, "learning_rate": 8.629642581742295e-06, "loss": 0.7934, "num_input_tokens_seen": 29219680, "step": 50660 }, { "epoch": 7.54617217753947, "grad_norm": 0.20483042299747467, "learning_rate": 8.624731375670752e-06, "loss": 0.8013, "num_input_tokens_seen": 29222400, "step": 50665 }, { "epoch": 7.546916890080429, "grad_norm": 0.19429540634155273, "learning_rate": 8.619821276173279e-06, "loss": 0.8007, "num_input_tokens_seen": 29225408, "step": 50670 }, { "epoch": 7.547661602621388, "grad_norm": 0.22421878576278687, "learning_rate": 8.614912283581683e-06, "loss": 0.8174, "num_input_tokens_seen": 29228000, "step": 50675 }, { "epoch": 7.548406315162348, "grad_norm": 0.1752859652042389, "learning_rate": 8.6100043982277e-06, "loss": 0.8092, "num_input_tokens_seen": 29230592, "step": 50680 }, { "epoch": 7.5491510277033065, "grad_norm": 0.24336712062358856, "learning_rate": 8.605097620442984e-06, "loss": 0.7796, "num_input_tokens_seen": 29233696, "step": 50685 }, { "epoch": 7.549895740244265, "grad_norm": 0.3146360516548157, "learning_rate": 8.600191950559111e-06, "loss": 0.7852, "num_input_tokens_seen": 29236576, "step": 50690 }, { "epoch": 7.550640452785225, "grad_norm": 0.21440957486629486, "learning_rate": 8.5952873889076e-06, "loss": 0.8091, "num_input_tokens_seen": 29239392, "step": 50695 }, { "epoch": 7.5513851653261845, "grad_norm": 0.2335035353899002, "learning_rate": 8.590383935819862e-06, "loss": 0.7925, "num_input_tokens_seen": 29242016, "step": 50700 }, { "epoch": 7.552129877867143, "grad_norm": 0.21412275731563568, "learning_rate": 8.58548159162727e-06, "loss": 0.8101, "num_input_tokens_seen": 29244640, "step": 50705 }, { "epoch": 7.552874590408102, "grad_norm": 0.2413841336965561, "learning_rate": 8.580580356661085e-06, "loss": 0.796, "num_input_tokens_seen": 29247360, "step": 50710 }, { "epoch": 7.553619302949062, "grad_norm": 0.28627902269363403, "learning_rate": 8.575680231252526e-06, "loss": 0.8068, "num_input_tokens_seen": 29250080, "step": 50715 }, { "epoch": 7.554364015490021, "grad_norm": 0.2954673767089844, "learning_rate": 8.570781215732718e-06, "loss": 0.8146, "num_input_tokens_seen": 29252704, "step": 50720 }, { "epoch": 7.55510872803098, "grad_norm": 0.1754736751317978, "learning_rate": 8.565883310432717e-06, "loss": 0.7956, "num_input_tokens_seen": 29255840, "step": 50725 }, { "epoch": 7.555853440571939, "grad_norm": 0.20038586854934692, "learning_rate": 8.56098651568352e-06, "loss": 0.7792, "num_input_tokens_seen": 29258592, "step": 50730 }, { "epoch": 7.5565981531128985, "grad_norm": 0.2225557267665863, "learning_rate": 8.556090831816006e-06, "loss": 0.809, "num_input_tokens_seen": 29261376, "step": 50735 }, { "epoch": 7.557342865653857, "grad_norm": 0.19233781099319458, "learning_rate": 8.551196259161017e-06, "loss": 0.7857, "num_input_tokens_seen": 29264448, "step": 50740 }, { "epoch": 7.558087578194817, "grad_norm": 0.18676859140396118, "learning_rate": 8.546302798049319e-06, "loss": 0.8057, "num_input_tokens_seen": 29267232, "step": 50745 }, { "epoch": 7.558832290735776, "grad_norm": 0.2664411962032318, "learning_rate": 8.541410448811574e-06, "loss": 0.7747, "num_input_tokens_seen": 29269920, "step": 50750 }, { "epoch": 7.559577003276735, "grad_norm": 0.1797255277633667, "learning_rate": 8.536519211778393e-06, "loss": 0.788, "num_input_tokens_seen": 29272640, "step": 50755 }, { "epoch": 7.560321715817694, "grad_norm": 0.22301135957241058, "learning_rate": 8.531629087280319e-06, "loss": 0.8109, "num_input_tokens_seen": 29275808, "step": 50760 }, { "epoch": 7.561066428358654, "grad_norm": 0.2146231234073639, "learning_rate": 8.526740075647784e-06, "loss": 0.7961, "num_input_tokens_seen": 29278624, "step": 50765 }, { "epoch": 7.5618111408996125, "grad_norm": 0.30567365884780884, "learning_rate": 8.52185217721118e-06, "loss": 0.8415, "num_input_tokens_seen": 29281504, "step": 50770 }, { "epoch": 7.562555853440572, "grad_norm": 0.24554090201854706, "learning_rate": 8.516965392300813e-06, "loss": 0.7915, "num_input_tokens_seen": 29284032, "step": 50775 }, { "epoch": 7.563300565981531, "grad_norm": 0.19791030883789062, "learning_rate": 8.512079721246907e-06, "loss": 0.7823, "num_input_tokens_seen": 29287136, "step": 50780 }, { "epoch": 7.5640452785224905, "grad_norm": 0.1912860870361328, "learning_rate": 8.50719516437963e-06, "loss": 0.7986, "num_input_tokens_seen": 29289920, "step": 50785 }, { "epoch": 7.564789991063449, "grad_norm": 0.2227986752986908, "learning_rate": 8.502311722029038e-06, "loss": 0.7842, "num_input_tokens_seen": 29292832, "step": 50790 }, { "epoch": 7.565534703604409, "grad_norm": 0.22707925736904144, "learning_rate": 8.497429394525155e-06, "loss": 0.8125, "num_input_tokens_seen": 29295840, "step": 50795 }, { "epoch": 7.566279416145368, "grad_norm": 0.21433165669441223, "learning_rate": 8.49254818219789e-06, "loss": 0.7841, "num_input_tokens_seen": 29298624, "step": 50800 }, { "epoch": 7.567024128686327, "grad_norm": 0.24749338626861572, "learning_rate": 8.487668085377104e-06, "loss": 0.8013, "num_input_tokens_seen": 29301472, "step": 50805 }, { "epoch": 7.567768841227286, "grad_norm": 0.20567339658737183, "learning_rate": 8.482789104392575e-06, "loss": 0.7794, "num_input_tokens_seen": 29304160, "step": 50810 }, { "epoch": 7.568513553768246, "grad_norm": 0.26723015308380127, "learning_rate": 8.477911239574005e-06, "loss": 0.7823, "num_input_tokens_seen": 29307520, "step": 50815 }, { "epoch": 7.5692582663092045, "grad_norm": 0.15059079229831696, "learning_rate": 8.473034491251016e-06, "loss": 0.8042, "num_input_tokens_seen": 29310400, "step": 50820 }, { "epoch": 7.570002978850164, "grad_norm": 0.19366347789764404, "learning_rate": 8.468158859753175e-06, "loss": 0.8254, "num_input_tokens_seen": 29313056, "step": 50825 }, { "epoch": 7.570747691391123, "grad_norm": 0.14725123345851898, "learning_rate": 8.463284345409941e-06, "loss": 0.8006, "num_input_tokens_seen": 29316096, "step": 50830 }, { "epoch": 7.571492403932083, "grad_norm": 0.17413313686847687, "learning_rate": 8.458410948550713e-06, "loss": 0.7985, "num_input_tokens_seen": 29319072, "step": 50835 }, { "epoch": 7.572237116473041, "grad_norm": 0.21326778829097748, "learning_rate": 8.453538669504818e-06, "loss": 0.8031, "num_input_tokens_seen": 29322080, "step": 50840 }, { "epoch": 7.572981829014001, "grad_norm": 0.2789587080478668, "learning_rate": 8.448667508601505e-06, "loss": 0.7812, "num_input_tokens_seen": 29324992, "step": 50845 }, { "epoch": 7.57372654155496, "grad_norm": 0.33287903666496277, "learning_rate": 8.44379746616995e-06, "loss": 0.797, "num_input_tokens_seen": 29327872, "step": 50850 }, { "epoch": 7.5744712540959185, "grad_norm": 0.1990203857421875, "learning_rate": 8.438928542539251e-06, "loss": 0.7874, "num_input_tokens_seen": 29330816, "step": 50855 }, { "epoch": 7.575215966636878, "grad_norm": 0.2478507161140442, "learning_rate": 8.434060738038438e-06, "loss": 0.8013, "num_input_tokens_seen": 29333696, "step": 50860 }, { "epoch": 7.575960679177838, "grad_norm": 0.24363888800144196, "learning_rate": 8.429194052996445e-06, "loss": 0.8064, "num_input_tokens_seen": 29336480, "step": 50865 }, { "epoch": 7.5767053917187965, "grad_norm": 0.28698229789733887, "learning_rate": 8.424328487742139e-06, "loss": 0.7926, "num_input_tokens_seen": 29339104, "step": 50870 }, { "epoch": 7.577450104259755, "grad_norm": 0.3303849995136261, "learning_rate": 8.419464042604322e-06, "loss": 0.7796, "num_input_tokens_seen": 29341984, "step": 50875 }, { "epoch": 7.578194816800715, "grad_norm": 0.3158569931983948, "learning_rate": 8.414600717911713e-06, "loss": 0.8307, "num_input_tokens_seen": 29345024, "step": 50880 }, { "epoch": 7.578939529341675, "grad_norm": 0.2045890837907791, "learning_rate": 8.409738513992958e-06, "loss": 0.7702, "num_input_tokens_seen": 29347808, "step": 50885 }, { "epoch": 7.579684241882633, "grad_norm": 0.2392076998949051, "learning_rate": 8.404877431176621e-06, "loss": 0.7667, "num_input_tokens_seen": 29351008, "step": 50890 }, { "epoch": 7.580428954423592, "grad_norm": 0.2990080714225769, "learning_rate": 8.400017469791206e-06, "loss": 0.826, "num_input_tokens_seen": 29353760, "step": 50895 }, { "epoch": 7.581173666964552, "grad_norm": 0.2555311322212219, "learning_rate": 8.395158630165112e-06, "loss": 0.7694, "num_input_tokens_seen": 29356736, "step": 50900 }, { "epoch": 7.5819183795055105, "grad_norm": 0.2105264961719513, "learning_rate": 8.390300912626686e-06, "loss": 0.796, "num_input_tokens_seen": 29359200, "step": 50905 }, { "epoch": 7.58266309204647, "grad_norm": 0.19632552564144135, "learning_rate": 8.385444317504201e-06, "loss": 0.8025, "num_input_tokens_seen": 29362112, "step": 50910 }, { "epoch": 7.583407804587429, "grad_norm": 0.2707816958427429, "learning_rate": 8.380588845125833e-06, "loss": 0.7825, "num_input_tokens_seen": 29365184, "step": 50915 }, { "epoch": 7.584152517128389, "grad_norm": 0.1630125492811203, "learning_rate": 8.3757344958197e-06, "loss": 0.8184, "num_input_tokens_seen": 29368192, "step": 50920 }, { "epoch": 7.584897229669347, "grad_norm": 0.3221820592880249, "learning_rate": 8.370881269913851e-06, "loss": 0.8267, "num_input_tokens_seen": 29370976, "step": 50925 }, { "epoch": 7.585641942210307, "grad_norm": 0.2207205593585968, "learning_rate": 8.366029167736227e-06, "loss": 0.7774, "num_input_tokens_seen": 29374048, "step": 50930 }, { "epoch": 7.586386654751266, "grad_norm": 0.2039846032857895, "learning_rate": 8.361178189614724e-06, "loss": 0.8024, "num_input_tokens_seen": 29376832, "step": 50935 }, { "epoch": 7.587131367292225, "grad_norm": 0.22232559323310852, "learning_rate": 8.356328335877147e-06, "loss": 0.81, "num_input_tokens_seen": 29379968, "step": 50940 }, { "epoch": 7.587876079833184, "grad_norm": 0.17935343086719513, "learning_rate": 8.351479606851236e-06, "loss": 0.7948, "num_input_tokens_seen": 29382752, "step": 50945 }, { "epoch": 7.588620792374144, "grad_norm": 0.32100141048431396, "learning_rate": 8.346632002864655e-06, "loss": 0.8027, "num_input_tokens_seen": 29385728, "step": 50950 }, { "epoch": 7.5893655049151025, "grad_norm": 0.22656536102294922, "learning_rate": 8.341785524244964e-06, "loss": 0.8051, "num_input_tokens_seen": 29388640, "step": 50955 }, { "epoch": 7.590110217456062, "grad_norm": 0.24753493070602417, "learning_rate": 8.33694017131969e-06, "loss": 0.7776, "num_input_tokens_seen": 29391616, "step": 50960 }, { "epoch": 7.590854929997021, "grad_norm": 0.2671663761138916, "learning_rate": 8.332095944416243e-06, "loss": 0.8201, "num_input_tokens_seen": 29394400, "step": 50965 }, { "epoch": 7.591599642537981, "grad_norm": 0.22962898015975952, "learning_rate": 8.327252843861986e-06, "loss": 0.8182, "num_input_tokens_seen": 29397216, "step": 50970 }, { "epoch": 7.592344355078939, "grad_norm": 0.22284413874149323, "learning_rate": 8.322410869984195e-06, "loss": 0.8224, "num_input_tokens_seen": 29400064, "step": 50975 }, { "epoch": 7.593089067619899, "grad_norm": 0.19179746508598328, "learning_rate": 8.317570023110072e-06, "loss": 0.8087, "num_input_tokens_seen": 29402944, "step": 50980 }, { "epoch": 7.593833780160858, "grad_norm": 0.2694738209247589, "learning_rate": 8.312730303566738e-06, "loss": 0.7749, "num_input_tokens_seen": 29405632, "step": 50985 }, { "epoch": 7.594578492701817, "grad_norm": 0.3311753273010254, "learning_rate": 8.307891711681257e-06, "loss": 0.8349, "num_input_tokens_seen": 29408800, "step": 50990 }, { "epoch": 7.595323205242776, "grad_norm": 0.21087174117565155, "learning_rate": 8.303054247780587e-06, "loss": 0.7929, "num_input_tokens_seen": 29411936, "step": 50995 }, { "epoch": 7.596067917783736, "grad_norm": 0.18981610238552094, "learning_rate": 8.298217912191617e-06, "loss": 0.8005, "num_input_tokens_seen": 29414752, "step": 51000 }, { "epoch": 7.596812630324695, "grad_norm": 0.15431858599185944, "learning_rate": 8.293382705241177e-06, "loss": 0.8014, "num_input_tokens_seen": 29417696, "step": 51005 }, { "epoch": 7.597557342865654, "grad_norm": 0.18581020832061768, "learning_rate": 8.28854862725601e-06, "loss": 0.8031, "num_input_tokens_seen": 29420544, "step": 51010 }, { "epoch": 7.598302055406613, "grad_norm": 0.21929438412189484, "learning_rate": 8.283715678562781e-06, "loss": 0.8165, "num_input_tokens_seen": 29423232, "step": 51015 }, { "epoch": 7.599046767947573, "grad_norm": 0.21859854459762573, "learning_rate": 8.278883859488085e-06, "loss": 0.7898, "num_input_tokens_seen": 29426432, "step": 51020 }, { "epoch": 7.599791480488531, "grad_norm": 0.20667089521884918, "learning_rate": 8.274053170358442e-06, "loss": 0.8061, "num_input_tokens_seen": 29429376, "step": 51025 }, { "epoch": 7.600536193029491, "grad_norm": 0.21613207459449768, "learning_rate": 8.269223611500285e-06, "loss": 0.7918, "num_input_tokens_seen": 29432192, "step": 51030 }, { "epoch": 7.60128090557045, "grad_norm": 0.26938652992248535, "learning_rate": 8.264395183239962e-06, "loss": 0.7823, "num_input_tokens_seen": 29435168, "step": 51035 }, { "epoch": 7.6020256181114085, "grad_norm": 0.15713456273078918, "learning_rate": 8.259567885903775e-06, "loss": 0.8138, "num_input_tokens_seen": 29437952, "step": 51040 }, { "epoch": 7.602770330652368, "grad_norm": 0.2796533405780792, "learning_rate": 8.254741719817924e-06, "loss": 0.7946, "num_input_tokens_seen": 29441056, "step": 51045 }, { "epoch": 7.603515043193328, "grad_norm": 0.21195319294929504, "learning_rate": 8.249916685308548e-06, "loss": 0.7763, "num_input_tokens_seen": 29443936, "step": 51050 }, { "epoch": 7.604259755734287, "grad_norm": 0.24779152870178223, "learning_rate": 8.245092782701703e-06, "loss": 0.7635, "num_input_tokens_seen": 29446784, "step": 51055 }, { "epoch": 7.605004468275245, "grad_norm": 0.2093939334154129, "learning_rate": 8.240270012323375e-06, "loss": 0.812, "num_input_tokens_seen": 29449792, "step": 51060 }, { "epoch": 7.605749180816205, "grad_norm": 0.3426380157470703, "learning_rate": 8.23544837449945e-06, "loss": 0.7935, "num_input_tokens_seen": 29453024, "step": 51065 }, { "epoch": 7.606493893357164, "grad_norm": 0.22764651477336884, "learning_rate": 8.230627869555775e-06, "loss": 0.7785, "num_input_tokens_seen": 29455680, "step": 51070 }, { "epoch": 7.607238605898123, "grad_norm": 0.23688189685344696, "learning_rate": 8.225808497818077e-06, "loss": 0.8019, "num_input_tokens_seen": 29458560, "step": 51075 }, { "epoch": 7.607983318439082, "grad_norm": 0.21194998919963837, "learning_rate": 8.220990259612043e-06, "loss": 0.8167, "num_input_tokens_seen": 29461856, "step": 51080 }, { "epoch": 7.608728030980042, "grad_norm": 0.18154826760292053, "learning_rate": 8.216173155263271e-06, "loss": 0.8276, "num_input_tokens_seen": 29464768, "step": 51085 }, { "epoch": 7.609472743521001, "grad_norm": 0.16795632243156433, "learning_rate": 8.211357185097285e-06, "loss": 0.8028, "num_input_tokens_seen": 29467680, "step": 51090 }, { "epoch": 7.61021745606196, "grad_norm": 0.19005654752254486, "learning_rate": 8.206542349439517e-06, "loss": 0.8019, "num_input_tokens_seen": 29470368, "step": 51095 }, { "epoch": 7.610962168602919, "grad_norm": 0.19657015800476074, "learning_rate": 8.20172864861534e-06, "loss": 0.8036, "num_input_tokens_seen": 29473120, "step": 51100 }, { "epoch": 7.611706881143879, "grad_norm": 0.24259541928768158, "learning_rate": 8.19691608295004e-06, "loss": 0.7903, "num_input_tokens_seen": 29476064, "step": 51105 }, { "epoch": 7.612451593684837, "grad_norm": 0.22535823285579681, "learning_rate": 8.192104652768848e-06, "loss": 0.7804, "num_input_tokens_seen": 29479232, "step": 51110 }, { "epoch": 7.613196306225797, "grad_norm": 0.20240949094295502, "learning_rate": 8.187294358396874e-06, "loss": 0.8203, "num_input_tokens_seen": 29482048, "step": 51115 }, { "epoch": 7.613941018766756, "grad_norm": 0.21939869225025177, "learning_rate": 8.182485200159195e-06, "loss": 0.7865, "num_input_tokens_seen": 29484672, "step": 51120 }, { "epoch": 7.614685731307715, "grad_norm": 0.3652513325214386, "learning_rate": 8.177677178380799e-06, "loss": 0.8391, "num_input_tokens_seen": 29487360, "step": 51125 }, { "epoch": 7.615430443848674, "grad_norm": 0.2632366418838501, "learning_rate": 8.172870293386579e-06, "loss": 0.7997, "num_input_tokens_seen": 29489696, "step": 51130 }, { "epoch": 7.616175156389634, "grad_norm": 0.21263207495212555, "learning_rate": 8.168064545501367e-06, "loss": 0.7889, "num_input_tokens_seen": 29492512, "step": 51135 }, { "epoch": 7.616919868930593, "grad_norm": 0.26327434182167053, "learning_rate": 8.16325993504992e-06, "loss": 0.8085, "num_input_tokens_seen": 29495296, "step": 51140 }, { "epoch": 7.617664581471552, "grad_norm": 0.21144743263721466, "learning_rate": 8.158456462356915e-06, "loss": 0.8054, "num_input_tokens_seen": 29498176, "step": 51145 }, { "epoch": 7.618409294012511, "grad_norm": 0.2229897379875183, "learning_rate": 8.153654127746957e-06, "loss": 0.8046, "num_input_tokens_seen": 29500832, "step": 51150 }, { "epoch": 7.619154006553471, "grad_norm": 0.22175632417201996, "learning_rate": 8.148852931544551e-06, "loss": 0.8052, "num_input_tokens_seen": 29503616, "step": 51155 }, { "epoch": 7.619898719094429, "grad_norm": 0.24522899091243744, "learning_rate": 8.144052874074162e-06, "loss": 0.801, "num_input_tokens_seen": 29506496, "step": 51160 }, { "epoch": 7.620643431635389, "grad_norm": 0.1927705556154251, "learning_rate": 8.139253955660139e-06, "loss": 0.8091, "num_input_tokens_seen": 29508928, "step": 51165 }, { "epoch": 7.621388144176348, "grad_norm": 0.14224712550640106, "learning_rate": 8.134456176626784e-06, "loss": 0.8168, "num_input_tokens_seen": 29511840, "step": 51170 }, { "epoch": 7.6221328567173074, "grad_norm": 0.34851914644241333, "learning_rate": 8.129659537298308e-06, "loss": 0.8122, "num_input_tokens_seen": 29514752, "step": 51175 }, { "epoch": 7.622877569258266, "grad_norm": 0.17608031630516052, "learning_rate": 8.124864037998852e-06, "loss": 0.8561, "num_input_tokens_seen": 29517376, "step": 51180 }, { "epoch": 7.623622281799226, "grad_norm": 0.20441189408302307, "learning_rate": 8.120069679052477e-06, "loss": 0.7991, "num_input_tokens_seen": 29520320, "step": 51185 }, { "epoch": 7.624366994340185, "grad_norm": 0.16811105608940125, "learning_rate": 8.115276460783172e-06, "loss": 0.8293, "num_input_tokens_seen": 29523200, "step": 51190 }, { "epoch": 7.625111706881144, "grad_norm": 0.17046648263931274, "learning_rate": 8.110484383514835e-06, "loss": 0.7931, "num_input_tokens_seen": 29525984, "step": 51195 }, { "epoch": 7.625856419422103, "grad_norm": 0.24537576735019684, "learning_rate": 8.105693447571286e-06, "loss": 0.7957, "num_input_tokens_seen": 29528992, "step": 51200 }, { "epoch": 7.626601131963062, "grad_norm": 0.2322234958410263, "learning_rate": 8.100903653276287e-06, "loss": 0.8394, "num_input_tokens_seen": 29531712, "step": 51205 }, { "epoch": 7.627345844504021, "grad_norm": 0.21367047727108002, "learning_rate": 8.096115000953513e-06, "loss": 0.8008, "num_input_tokens_seen": 29534432, "step": 51210 }, { "epoch": 7.628090557044981, "grad_norm": 0.2127552628517151, "learning_rate": 8.091327490926561e-06, "loss": 0.7936, "num_input_tokens_seen": 29537248, "step": 51215 }, { "epoch": 7.62883526958594, "grad_norm": 0.2056271880865097, "learning_rate": 8.08654112351895e-06, "loss": 0.8066, "num_input_tokens_seen": 29540096, "step": 51220 }, { "epoch": 7.629579982126899, "grad_norm": 0.2201310098171234, "learning_rate": 8.081755899054136e-06, "loss": 0.7936, "num_input_tokens_seen": 29543040, "step": 51225 }, { "epoch": 7.630324694667858, "grad_norm": 0.22228160500526428, "learning_rate": 8.076971817855472e-06, "loss": 0.8158, "num_input_tokens_seen": 29545952, "step": 51230 }, { "epoch": 7.631069407208818, "grad_norm": 0.3012636601924896, "learning_rate": 8.07218888024624e-06, "loss": 0.8063, "num_input_tokens_seen": 29549024, "step": 51235 }, { "epoch": 7.631814119749777, "grad_norm": 0.21914538741111755, "learning_rate": 8.067407086549661e-06, "loss": 0.807, "num_input_tokens_seen": 29552064, "step": 51240 }, { "epoch": 7.632558832290735, "grad_norm": 0.2015630602836609, "learning_rate": 8.06262643708887e-06, "loss": 0.7869, "num_input_tokens_seen": 29554944, "step": 51245 }, { "epoch": 7.633303544831695, "grad_norm": 0.1855161339044571, "learning_rate": 8.05784693218692e-06, "loss": 0.7991, "num_input_tokens_seen": 29557920, "step": 51250 }, { "epoch": 7.634048257372654, "grad_norm": 0.26194247603416443, "learning_rate": 8.053068572166797e-06, "loss": 0.8151, "num_input_tokens_seen": 29560736, "step": 51255 }, { "epoch": 7.6347929699136134, "grad_norm": 0.19979330897331238, "learning_rate": 8.048291357351395e-06, "loss": 0.8053, "num_input_tokens_seen": 29563680, "step": 51260 }, { "epoch": 7.635537682454572, "grad_norm": 0.23859131336212158, "learning_rate": 8.043515288063542e-06, "loss": 0.8102, "num_input_tokens_seen": 29566528, "step": 51265 }, { "epoch": 7.636282394995532, "grad_norm": 0.1723155528306961, "learning_rate": 8.038740364625994e-06, "loss": 0.8245, "num_input_tokens_seen": 29569344, "step": 51270 }, { "epoch": 7.637027107536491, "grad_norm": 0.1833619773387909, "learning_rate": 8.033966587361402e-06, "loss": 0.7912, "num_input_tokens_seen": 29572448, "step": 51275 }, { "epoch": 7.63777182007745, "grad_norm": 0.1739194691181183, "learning_rate": 8.029193956592371e-06, "loss": 0.7926, "num_input_tokens_seen": 29575040, "step": 51280 }, { "epoch": 7.638516532618409, "grad_norm": 0.5683863759040833, "learning_rate": 8.024422472641416e-06, "loss": 0.7993, "num_input_tokens_seen": 29578272, "step": 51285 }, { "epoch": 7.639261245159369, "grad_norm": 0.19854013621807098, "learning_rate": 8.01965213583098e-06, "loss": 0.7936, "num_input_tokens_seen": 29581056, "step": 51290 }, { "epoch": 7.640005957700327, "grad_norm": 0.20768329501152039, "learning_rate": 8.014882946483403e-06, "loss": 0.7995, "num_input_tokens_seen": 29583808, "step": 51295 }, { "epoch": 7.640750670241287, "grad_norm": 0.21254074573516846, "learning_rate": 8.010114904920984e-06, "loss": 0.783, "num_input_tokens_seen": 29586848, "step": 51300 }, { "epoch": 7.641495382782246, "grad_norm": 0.2112930566072464, "learning_rate": 8.005348011465925e-06, "loss": 0.8053, "num_input_tokens_seen": 29589600, "step": 51305 }, { "epoch": 7.6422400953232055, "grad_norm": 0.1923830509185791, "learning_rate": 8.000582266440356e-06, "loss": 0.7977, "num_input_tokens_seen": 29592480, "step": 51310 }, { "epoch": 7.642984807864164, "grad_norm": 0.2386922538280487, "learning_rate": 7.995817670166319e-06, "loss": 0.794, "num_input_tokens_seen": 29595488, "step": 51315 }, { "epoch": 7.643729520405124, "grad_norm": 0.27062779664993286, "learning_rate": 7.991054222965788e-06, "loss": 0.8085, "num_input_tokens_seen": 29598432, "step": 51320 }, { "epoch": 7.644474232946083, "grad_norm": 0.3538072407245636, "learning_rate": 7.986291925160668e-06, "loss": 0.8022, "num_input_tokens_seen": 29601312, "step": 51325 }, { "epoch": 7.645218945487042, "grad_norm": 0.2148997038602829, "learning_rate": 7.98153077707276e-06, "loss": 0.7889, "num_input_tokens_seen": 29604032, "step": 51330 }, { "epoch": 7.645963658028001, "grad_norm": 0.18306970596313477, "learning_rate": 7.976770779023807e-06, "loss": 0.783, "num_input_tokens_seen": 29606752, "step": 51335 }, { "epoch": 7.646708370568961, "grad_norm": 0.19060266017913818, "learning_rate": 7.972011931335474e-06, "loss": 0.8178, "num_input_tokens_seen": 29609504, "step": 51340 }, { "epoch": 7.6474530831099194, "grad_norm": 0.21389196813106537, "learning_rate": 7.967254234329347e-06, "loss": 0.788, "num_input_tokens_seen": 29612384, "step": 51345 }, { "epoch": 7.648197795650879, "grad_norm": 0.2473938912153244, "learning_rate": 7.962497688326934e-06, "loss": 0.7852, "num_input_tokens_seen": 29615488, "step": 51350 }, { "epoch": 7.648942508191838, "grad_norm": 0.261661559343338, "learning_rate": 7.95774229364965e-06, "loss": 0.7897, "num_input_tokens_seen": 29618816, "step": 51355 }, { "epoch": 7.6496872207327975, "grad_norm": 0.19705413281917572, "learning_rate": 7.952988050618862e-06, "loss": 0.7738, "num_input_tokens_seen": 29621504, "step": 51360 }, { "epoch": 7.650431933273756, "grad_norm": 0.19754353165626526, "learning_rate": 7.948234959555825e-06, "loss": 0.8053, "num_input_tokens_seen": 29624448, "step": 51365 }, { "epoch": 7.651176645814716, "grad_norm": 0.2523272633552551, "learning_rate": 7.943483020781741e-06, "loss": 0.7792, "num_input_tokens_seen": 29627488, "step": 51370 }, { "epoch": 7.651921358355675, "grad_norm": 0.22489576041698456, "learning_rate": 7.93873223461773e-06, "loss": 0.7943, "num_input_tokens_seen": 29630080, "step": 51375 }, { "epoch": 7.652666070896634, "grad_norm": 0.22109369933605194, "learning_rate": 7.933982601384825e-06, "loss": 0.7959, "num_input_tokens_seen": 29633056, "step": 51380 }, { "epoch": 7.653410783437593, "grad_norm": 0.1553312987089157, "learning_rate": 7.929234121403994e-06, "loss": 0.799, "num_input_tokens_seen": 29636128, "step": 51385 }, { "epoch": 7.654155495978552, "grad_norm": 0.26994240283966064, "learning_rate": 7.924486794996122e-06, "loss": 0.8007, "num_input_tokens_seen": 29639008, "step": 51390 }, { "epoch": 7.6549002085195115, "grad_norm": 0.1618269979953766, "learning_rate": 7.919740622482012e-06, "loss": 0.8024, "num_input_tokens_seen": 29641888, "step": 51395 }, { "epoch": 7.655644921060471, "grad_norm": 0.16103637218475342, "learning_rate": 7.914995604182373e-06, "loss": 0.8033, "num_input_tokens_seen": 29644736, "step": 51400 }, { "epoch": 7.65638963360143, "grad_norm": 0.14961625635623932, "learning_rate": 7.910251740417873e-06, "loss": 0.7994, "num_input_tokens_seen": 29647456, "step": 51405 }, { "epoch": 7.657134346142389, "grad_norm": 0.36856934428215027, "learning_rate": 7.905509031509079e-06, "loss": 0.8272, "num_input_tokens_seen": 29650656, "step": 51410 }, { "epoch": 7.657879058683348, "grad_norm": 0.24584901332855225, "learning_rate": 7.900767477776483e-06, "loss": 0.8012, "num_input_tokens_seen": 29653568, "step": 51415 }, { "epoch": 7.658623771224307, "grad_norm": 0.2014511525630951, "learning_rate": 7.896027079540508e-06, "loss": 0.82, "num_input_tokens_seen": 29656384, "step": 51420 }, { "epoch": 7.659368483765267, "grad_norm": 0.19764229655265808, "learning_rate": 7.891287837121472e-06, "loss": 0.7905, "num_input_tokens_seen": 29659232, "step": 51425 }, { "epoch": 7.6601131963062254, "grad_norm": 0.23605208098888397, "learning_rate": 7.886549750839658e-06, "loss": 0.7776, "num_input_tokens_seen": 29662240, "step": 51430 }, { "epoch": 7.660857908847185, "grad_norm": 0.19567225873470306, "learning_rate": 7.881812821015221e-06, "loss": 0.8426, "num_input_tokens_seen": 29665312, "step": 51435 }, { "epoch": 7.661602621388144, "grad_norm": 0.29142823815345764, "learning_rate": 7.877077047968282e-06, "loss": 0.7759, "num_input_tokens_seen": 29668192, "step": 51440 }, { "epoch": 7.6623473339291035, "grad_norm": 0.19280026853084564, "learning_rate": 7.872342432018856e-06, "loss": 0.7949, "num_input_tokens_seen": 29671040, "step": 51445 }, { "epoch": 7.663092046470062, "grad_norm": 0.1866975724697113, "learning_rate": 7.867608973486892e-06, "loss": 0.8012, "num_input_tokens_seen": 29673792, "step": 51450 }, { "epoch": 7.663836759011022, "grad_norm": 0.1822325736284256, "learning_rate": 7.862876672692265e-06, "loss": 0.8221, "num_input_tokens_seen": 29676736, "step": 51455 }, { "epoch": 7.664581471551981, "grad_norm": 0.24945800006389618, "learning_rate": 7.858145529954752e-06, "loss": 0.7849, "num_input_tokens_seen": 29679712, "step": 51460 }, { "epoch": 7.66532618409294, "grad_norm": 0.2307312935590744, "learning_rate": 7.853415545594073e-06, "loss": 0.796, "num_input_tokens_seen": 29682432, "step": 51465 }, { "epoch": 7.666070896633899, "grad_norm": 0.2612060308456421, "learning_rate": 7.848686719929863e-06, "loss": 0.7864, "num_input_tokens_seen": 29685184, "step": 51470 }, { "epoch": 7.666815609174859, "grad_norm": 0.2067686915397644, "learning_rate": 7.843959053281663e-06, "loss": 0.8226, "num_input_tokens_seen": 29687968, "step": 51475 }, { "epoch": 7.6675603217158175, "grad_norm": 0.29235801100730896, "learning_rate": 7.839232545968964e-06, "loss": 0.7992, "num_input_tokens_seen": 29690912, "step": 51480 }, { "epoch": 7.668305034256777, "grad_norm": 0.29302239418029785, "learning_rate": 7.834507198311154e-06, "loss": 0.8124, "num_input_tokens_seen": 29693792, "step": 51485 }, { "epoch": 7.669049746797736, "grad_norm": 0.22553277015686035, "learning_rate": 7.829783010627568e-06, "loss": 0.7819, "num_input_tokens_seen": 29696512, "step": 51490 }, { "epoch": 7.6697944593386955, "grad_norm": 0.22826167941093445, "learning_rate": 7.82505998323743e-06, "loss": 0.7734, "num_input_tokens_seen": 29699456, "step": 51495 }, { "epoch": 7.670539171879654, "grad_norm": 0.26057371497154236, "learning_rate": 7.820338116459908e-06, "loss": 0.7936, "num_input_tokens_seen": 29702304, "step": 51500 }, { "epoch": 7.671283884420614, "grad_norm": 0.22045865654945374, "learning_rate": 7.815617410614087e-06, "loss": 0.8023, "num_input_tokens_seen": 29705312, "step": 51505 }, { "epoch": 7.672028596961573, "grad_norm": 0.19849205017089844, "learning_rate": 7.81089786601898e-06, "loss": 0.8118, "num_input_tokens_seen": 29708320, "step": 51510 }, { "epoch": 7.672773309502532, "grad_norm": 0.19471703469753265, "learning_rate": 7.806179482993514e-06, "loss": 0.7885, "num_input_tokens_seen": 29711616, "step": 51515 }, { "epoch": 7.673518022043491, "grad_norm": 0.3420903980731964, "learning_rate": 7.801462261856526e-06, "loss": 0.8117, "num_input_tokens_seen": 29714848, "step": 51520 }, { "epoch": 7.674262734584451, "grad_norm": 0.20453104376792908, "learning_rate": 7.796746202926802e-06, "loss": 0.8162, "num_input_tokens_seen": 29717664, "step": 51525 }, { "epoch": 7.6750074471254095, "grad_norm": 0.22853659093379974, "learning_rate": 7.792031306523018e-06, "loss": 0.7965, "num_input_tokens_seen": 29720672, "step": 51530 }, { "epoch": 7.675752159666369, "grad_norm": 0.19128602743148804, "learning_rate": 7.787317572963798e-06, "loss": 0.8061, "num_input_tokens_seen": 29723808, "step": 51535 }, { "epoch": 7.676496872207328, "grad_norm": 0.2212265133857727, "learning_rate": 7.782605002567673e-06, "loss": 0.7923, "num_input_tokens_seen": 29726400, "step": 51540 }, { "epoch": 7.6772415847482876, "grad_norm": 0.22286023199558258, "learning_rate": 7.777893595653102e-06, "loss": 0.7924, "num_input_tokens_seen": 29729504, "step": 51545 }, { "epoch": 7.677986297289246, "grad_norm": 0.24157987534999847, "learning_rate": 7.77318335253846e-06, "loss": 0.8067, "num_input_tokens_seen": 29732672, "step": 51550 }, { "epoch": 7.678731009830205, "grad_norm": 0.18244123458862305, "learning_rate": 7.768474273542056e-06, "loss": 0.7908, "num_input_tokens_seen": 29735552, "step": 51555 }, { "epoch": 7.679475722371165, "grad_norm": 0.19997064769268036, "learning_rate": 7.763766358982104e-06, "loss": 0.8031, "num_input_tokens_seen": 29738624, "step": 51560 }, { "epoch": 7.680220434912124, "grad_norm": 0.19724629819393158, "learning_rate": 7.759059609176735e-06, "loss": 0.7991, "num_input_tokens_seen": 29741376, "step": 51565 }, { "epoch": 7.680965147453083, "grad_norm": 0.3184395134449005, "learning_rate": 7.754354024444021e-06, "loss": 0.8072, "num_input_tokens_seen": 29744192, "step": 51570 }, { "epoch": 7.681709859994042, "grad_norm": 0.25269994139671326, "learning_rate": 7.749649605101947e-06, "loss": 0.7841, "num_input_tokens_seen": 29746944, "step": 51575 }, { "epoch": 7.6824545725350015, "grad_norm": 0.22354203462600708, "learning_rate": 7.744946351468419e-06, "loss": 0.8064, "num_input_tokens_seen": 29749696, "step": 51580 }, { "epoch": 7.683199285075961, "grad_norm": 0.2383776158094406, "learning_rate": 7.740244263861268e-06, "loss": 0.793, "num_input_tokens_seen": 29752480, "step": 51585 }, { "epoch": 7.68394399761692, "grad_norm": 0.20649529993534088, "learning_rate": 7.73554334259823e-06, "loss": 0.8005, "num_input_tokens_seen": 29755232, "step": 51590 }, { "epoch": 7.684688710157879, "grad_norm": 0.18021589517593384, "learning_rate": 7.730843587996989e-06, "loss": 0.8055, "num_input_tokens_seen": 29758304, "step": 51595 }, { "epoch": 7.685433422698838, "grad_norm": 0.24267686903476715, "learning_rate": 7.72614500037512e-06, "loss": 0.8075, "num_input_tokens_seen": 29761344, "step": 51600 }, { "epoch": 7.686178135239797, "grad_norm": 0.35686925053596497, "learning_rate": 7.72144758005014e-06, "loss": 0.8056, "num_input_tokens_seen": 29764192, "step": 51605 }, { "epoch": 7.686922847780757, "grad_norm": 0.2592530846595764, "learning_rate": 7.716751327339484e-06, "loss": 0.8163, "num_input_tokens_seen": 29767008, "step": 51610 }, { "epoch": 7.6876675603217155, "grad_norm": 0.18604373931884766, "learning_rate": 7.712056242560503e-06, "loss": 0.7762, "num_input_tokens_seen": 29769920, "step": 51615 }, { "epoch": 7.688412272862675, "grad_norm": 0.2973596453666687, "learning_rate": 7.707362326030482e-06, "loss": 0.8171, "num_input_tokens_seen": 29772960, "step": 51620 }, { "epoch": 7.689156985403634, "grad_norm": 0.2008534073829651, "learning_rate": 7.702669578066602e-06, "loss": 0.7914, "num_input_tokens_seen": 29775872, "step": 51625 }, { "epoch": 7.6899016979445936, "grad_norm": 0.2148311585187912, "learning_rate": 7.697977998985984e-06, "loss": 0.7828, "num_input_tokens_seen": 29778912, "step": 51630 }, { "epoch": 7.690646410485552, "grad_norm": 0.2457837015390396, "learning_rate": 7.693287589105678e-06, "loss": 0.7897, "num_input_tokens_seen": 29781792, "step": 51635 }, { "epoch": 7.691391123026512, "grad_norm": 0.30044716596603394, "learning_rate": 7.68859834874262e-06, "loss": 0.8089, "num_input_tokens_seen": 29784672, "step": 51640 }, { "epoch": 7.692135835567471, "grad_norm": 0.19490642845630646, "learning_rate": 7.683910278213708e-06, "loss": 0.7905, "num_input_tokens_seen": 29787552, "step": 51645 }, { "epoch": 7.69288054810843, "grad_norm": 0.3069201409816742, "learning_rate": 7.679223377835735e-06, "loss": 0.8176, "num_input_tokens_seen": 29790496, "step": 51650 }, { "epoch": 7.693625260649389, "grad_norm": 0.22079209983348846, "learning_rate": 7.674537647925434e-06, "loss": 0.7928, "num_input_tokens_seen": 29793504, "step": 51655 }, { "epoch": 7.694369973190349, "grad_norm": 0.24148458242416382, "learning_rate": 7.669853088799432e-06, "loss": 0.8012, "num_input_tokens_seen": 29796416, "step": 51660 }, { "epoch": 7.6951146857313075, "grad_norm": 0.338890939950943, "learning_rate": 7.665169700774294e-06, "loss": 0.7835, "num_input_tokens_seen": 29799872, "step": 51665 }, { "epoch": 7.695859398272267, "grad_norm": 0.17878340184688568, "learning_rate": 7.660487484166513e-06, "loss": 0.797, "num_input_tokens_seen": 29802720, "step": 51670 }, { "epoch": 7.696604110813226, "grad_norm": 0.25858137011528015, "learning_rate": 7.655806439292498e-06, "loss": 0.8109, "num_input_tokens_seen": 29805472, "step": 51675 }, { "epoch": 7.697348823354186, "grad_norm": 0.20813550055027008, "learning_rate": 7.651126566468559e-06, "loss": 0.8068, "num_input_tokens_seen": 29808448, "step": 51680 }, { "epoch": 7.698093535895144, "grad_norm": 0.1983003169298172, "learning_rate": 7.64644786601095e-06, "loss": 0.8169, "num_input_tokens_seen": 29811200, "step": 51685 }, { "epoch": 7.698838248436104, "grad_norm": 0.20190571248531342, "learning_rate": 7.641770338235851e-06, "loss": 0.7881, "num_input_tokens_seen": 29814304, "step": 51690 }, { "epoch": 7.699582960977063, "grad_norm": 0.21440763771533966, "learning_rate": 7.637093983459329e-06, "loss": 0.8104, "num_input_tokens_seen": 29817088, "step": 51695 }, { "epoch": 7.700327673518022, "grad_norm": 0.24095772206783295, "learning_rate": 7.632418801997404e-06, "loss": 0.8037, "num_input_tokens_seen": 29819872, "step": 51700 }, { "epoch": 7.701072386058981, "grad_norm": 0.1808047741651535, "learning_rate": 7.627744794166003e-06, "loss": 0.778, "num_input_tokens_seen": 29822592, "step": 51705 }, { "epoch": 7.701817098599941, "grad_norm": 0.24877990782260895, "learning_rate": 7.623071960280981e-06, "loss": 0.8262, "num_input_tokens_seen": 29825440, "step": 51710 }, { "epoch": 7.7025618111408996, "grad_norm": 0.17083711922168732, "learning_rate": 7.618400300658113e-06, "loss": 0.7885, "num_input_tokens_seen": 29828320, "step": 51715 }, { "epoch": 7.703306523681858, "grad_norm": 0.24661938846111298, "learning_rate": 7.613729815613077e-06, "loss": 0.7982, "num_input_tokens_seen": 29831232, "step": 51720 }, { "epoch": 7.704051236222818, "grad_norm": 0.2862043082714081, "learning_rate": 7.609060505461499e-06, "loss": 0.8044, "num_input_tokens_seen": 29834336, "step": 51725 }, { "epoch": 7.704795948763778, "grad_norm": 0.19044283032417297, "learning_rate": 7.6043923705189005e-06, "loss": 0.8109, "num_input_tokens_seen": 29837440, "step": 51730 }, { "epoch": 7.705540661304736, "grad_norm": 0.2601523697376251, "learning_rate": 7.599725411100739e-06, "loss": 0.8065, "num_input_tokens_seen": 29840416, "step": 51735 }, { "epoch": 7.706285373845695, "grad_norm": 0.30829399824142456, "learning_rate": 7.595059627522389e-06, "loss": 0.8136, "num_input_tokens_seen": 29843552, "step": 51740 }, { "epoch": 7.707030086386655, "grad_norm": 0.17107419669628143, "learning_rate": 7.590395020099145e-06, "loss": 0.8257, "num_input_tokens_seen": 29846240, "step": 51745 }, { "epoch": 7.707774798927614, "grad_norm": 0.18481916189193726, "learning_rate": 7.585731589146234e-06, "loss": 0.7898, "num_input_tokens_seen": 29848992, "step": 51750 }, { "epoch": 7.708519511468573, "grad_norm": 0.19738242030143738, "learning_rate": 7.581069334978771e-06, "loss": 0.812, "num_input_tokens_seen": 29851872, "step": 51755 }, { "epoch": 7.709264224009532, "grad_norm": 0.22575411200523376, "learning_rate": 7.57640825791183e-06, "loss": 0.8021, "num_input_tokens_seen": 29854688, "step": 51760 }, { "epoch": 7.710008936550492, "grad_norm": 0.21680603921413422, "learning_rate": 7.571748358260372e-06, "loss": 0.8037, "num_input_tokens_seen": 29857280, "step": 51765 }, { "epoch": 7.71075364909145, "grad_norm": 0.2575710117816925, "learning_rate": 7.567089636339303e-06, "loss": 0.7971, "num_input_tokens_seen": 29860064, "step": 51770 }, { "epoch": 7.71149836163241, "grad_norm": 0.1983192414045334, "learning_rate": 7.562432092463439e-06, "loss": 0.7946, "num_input_tokens_seen": 29863136, "step": 51775 }, { "epoch": 7.712243074173369, "grad_norm": 0.2549301087856293, "learning_rate": 7.557775726947519e-06, "loss": 0.7895, "num_input_tokens_seen": 29865920, "step": 51780 }, { "epoch": 7.712987786714328, "grad_norm": 0.21236245334148407, "learning_rate": 7.553120540106206e-06, "loss": 0.8031, "num_input_tokens_seen": 29868928, "step": 51785 }, { "epoch": 7.713732499255287, "grad_norm": 0.19077318906784058, "learning_rate": 7.5484665322540675e-06, "loss": 0.7922, "num_input_tokens_seen": 29871712, "step": 51790 }, { "epoch": 7.714477211796247, "grad_norm": 0.17935311794281006, "learning_rate": 7.543813703705616e-06, "loss": 0.8102, "num_input_tokens_seen": 29874688, "step": 51795 }, { "epoch": 7.7152219243372056, "grad_norm": 0.23816804587841034, "learning_rate": 7.539162054775253e-06, "loss": 0.8027, "num_input_tokens_seen": 29877536, "step": 51800 }, { "epoch": 7.715966636878165, "grad_norm": 0.18435432016849518, "learning_rate": 7.534511585777326e-06, "loss": 0.8024, "num_input_tokens_seen": 29880480, "step": 51805 }, { "epoch": 7.716711349419124, "grad_norm": 0.275669127702713, "learning_rate": 7.529862297026099e-06, "loss": 0.7724, "num_input_tokens_seen": 29883424, "step": 51810 }, { "epoch": 7.717456061960084, "grad_norm": 0.254846453666687, "learning_rate": 7.525214188835749e-06, "loss": 0.8155, "num_input_tokens_seen": 29886176, "step": 51815 }, { "epoch": 7.718200774501042, "grad_norm": 0.23414568603038788, "learning_rate": 7.520567261520387e-06, "loss": 0.8109, "num_input_tokens_seen": 29889088, "step": 51820 }, { "epoch": 7.718945487042002, "grad_norm": 0.23973631858825684, "learning_rate": 7.515921515394014e-06, "loss": 0.785, "num_input_tokens_seen": 29891744, "step": 51825 }, { "epoch": 7.719690199582961, "grad_norm": 0.2838503420352936, "learning_rate": 7.51127695077058e-06, "loss": 0.8065, "num_input_tokens_seen": 29894624, "step": 51830 }, { "epoch": 7.72043491212392, "grad_norm": 0.19543500244617462, "learning_rate": 7.506633567963953e-06, "loss": 0.8001, "num_input_tokens_seen": 29897568, "step": 51835 }, { "epoch": 7.721179624664879, "grad_norm": 0.2524951696395874, "learning_rate": 7.501991367287897e-06, "loss": 0.7996, "num_input_tokens_seen": 29900512, "step": 51840 }, { "epoch": 7.721924337205839, "grad_norm": 0.19218607246875763, "learning_rate": 7.497350349056126e-06, "loss": 0.8159, "num_input_tokens_seen": 29903136, "step": 51845 }, { "epoch": 7.722669049746798, "grad_norm": 0.3185689449310303, "learning_rate": 7.492710513582257e-06, "loss": 0.7851, "num_input_tokens_seen": 29906208, "step": 51850 }, { "epoch": 7.723413762287757, "grad_norm": 0.2073054015636444, "learning_rate": 7.488071861179838e-06, "loss": 0.8337, "num_input_tokens_seen": 29908768, "step": 51855 }, { "epoch": 7.724158474828716, "grad_norm": 0.22033947706222534, "learning_rate": 7.4834343921623165e-06, "loss": 0.8041, "num_input_tokens_seen": 29911776, "step": 51860 }, { "epoch": 7.724903187369676, "grad_norm": 0.2682039141654968, "learning_rate": 7.478798106843085e-06, "loss": 0.8141, "num_input_tokens_seen": 29914752, "step": 51865 }, { "epoch": 7.725647899910634, "grad_norm": 0.23550459742546082, "learning_rate": 7.474163005535439e-06, "loss": 0.7892, "num_input_tokens_seen": 29917312, "step": 51870 }, { "epoch": 7.726392612451594, "grad_norm": 0.17183877527713776, "learning_rate": 7.46952908855261e-06, "loss": 0.7916, "num_input_tokens_seen": 29920224, "step": 51875 }, { "epoch": 7.727137324992553, "grad_norm": 0.2312789261341095, "learning_rate": 7.464896356207723e-06, "loss": 0.81, "num_input_tokens_seen": 29923040, "step": 51880 }, { "epoch": 7.727882037533512, "grad_norm": 0.18735942244529724, "learning_rate": 7.460264808813849e-06, "loss": 0.7894, "num_input_tokens_seen": 29926176, "step": 51885 }, { "epoch": 7.728626750074471, "grad_norm": 0.3940179646015167, "learning_rate": 7.455634446683976e-06, "loss": 0.7852, "num_input_tokens_seen": 29929600, "step": 51890 }, { "epoch": 7.729371462615431, "grad_norm": 0.23907268047332764, "learning_rate": 7.451005270130987e-06, "loss": 0.8053, "num_input_tokens_seen": 29932384, "step": 51895 }, { "epoch": 7.73011617515639, "grad_norm": 0.2532368004322052, "learning_rate": 7.4463772794677145e-06, "loss": 0.7978, "num_input_tokens_seen": 29935584, "step": 51900 }, { "epoch": 7.730860887697348, "grad_norm": 0.260505348443985, "learning_rate": 7.441750475006898e-06, "loss": 0.7895, "num_input_tokens_seen": 29938368, "step": 51905 }, { "epoch": 7.731605600238308, "grad_norm": 0.19538050889968872, "learning_rate": 7.4371248570611975e-06, "loss": 0.777, "num_input_tokens_seen": 29941056, "step": 51910 }, { "epoch": 7.732350312779268, "grad_norm": 0.19302459061145782, "learning_rate": 7.4325004259432006e-06, "loss": 0.7873, "num_input_tokens_seen": 29943808, "step": 51915 }, { "epoch": 7.733095025320226, "grad_norm": 0.26273882389068604, "learning_rate": 7.427877181965393e-06, "loss": 0.8075, "num_input_tokens_seen": 29946816, "step": 51920 }, { "epoch": 7.733839737861185, "grad_norm": 0.2576335668563843, "learning_rate": 7.423255125440212e-06, "loss": 0.8049, "num_input_tokens_seen": 29949760, "step": 51925 }, { "epoch": 7.734584450402145, "grad_norm": 0.24369589984416962, "learning_rate": 7.418634256679976e-06, "loss": 0.804, "num_input_tokens_seen": 29952576, "step": 51930 }, { "epoch": 7.735329162943104, "grad_norm": 0.19958682358264923, "learning_rate": 7.414014575996961e-06, "loss": 0.8051, "num_input_tokens_seen": 29955360, "step": 51935 }, { "epoch": 7.736073875484063, "grad_norm": 0.3419366180896759, "learning_rate": 7.409396083703341e-06, "loss": 0.8103, "num_input_tokens_seen": 29957920, "step": 51940 }, { "epoch": 7.736818588025022, "grad_norm": 0.17107681930065155, "learning_rate": 7.404778780111213e-06, "loss": 0.8161, "num_input_tokens_seen": 29960736, "step": 51945 }, { "epoch": 7.737563300565982, "grad_norm": 0.2064715176820755, "learning_rate": 7.400162665532606e-06, "loss": 0.7975, "num_input_tokens_seen": 29963616, "step": 51950 }, { "epoch": 7.73830801310694, "grad_norm": 0.19752706587314606, "learning_rate": 7.3955477402794435e-06, "loss": 0.7783, "num_input_tokens_seen": 29966400, "step": 51955 }, { "epoch": 7.7390527256479, "grad_norm": 0.17400719225406647, "learning_rate": 7.390934004663597e-06, "loss": 0.8029, "num_input_tokens_seen": 29969248, "step": 51960 }, { "epoch": 7.739797438188859, "grad_norm": 0.33516213297843933, "learning_rate": 7.386321458996831e-06, "loss": 0.8148, "num_input_tokens_seen": 29972096, "step": 51965 }, { "epoch": 7.740542150729818, "grad_norm": 0.20848780870437622, "learning_rate": 7.381710103590847e-06, "loss": 0.8137, "num_input_tokens_seen": 29975072, "step": 51970 }, { "epoch": 7.741286863270777, "grad_norm": 0.2615157663822174, "learning_rate": 7.377099938757265e-06, "loss": 0.81, "num_input_tokens_seen": 29978080, "step": 51975 }, { "epoch": 7.742031575811737, "grad_norm": 0.2832684814929962, "learning_rate": 7.372490964807619e-06, "loss": 0.7883, "num_input_tokens_seen": 29980928, "step": 51980 }, { "epoch": 7.742776288352696, "grad_norm": 0.2240033745765686, "learning_rate": 7.367883182053373e-06, "loss": 0.7999, "num_input_tokens_seen": 29983904, "step": 51985 }, { "epoch": 7.743521000893655, "grad_norm": 0.21067510545253754, "learning_rate": 7.363276590805887e-06, "loss": 0.8034, "num_input_tokens_seen": 29986720, "step": 51990 }, { "epoch": 7.744265713434614, "grad_norm": 0.3140665888786316, "learning_rate": 7.358671191376474e-06, "loss": 0.8128, "num_input_tokens_seen": 29989600, "step": 51995 }, { "epoch": 7.745010425975574, "grad_norm": 0.1709834188222885, "learning_rate": 7.3540669840763246e-06, "loss": 0.8251, "num_input_tokens_seen": 29992608, "step": 52000 }, { "epoch": 7.745755138516532, "grad_norm": 0.2480539232492447, "learning_rate": 7.349463969216589e-06, "loss": 0.8053, "num_input_tokens_seen": 29995968, "step": 52005 }, { "epoch": 7.746499851057492, "grad_norm": 0.19847309589385986, "learning_rate": 7.344862147108314e-06, "loss": 0.8268, "num_input_tokens_seen": 29998816, "step": 52010 }, { "epoch": 7.747244563598451, "grad_norm": 0.26033976674079895, "learning_rate": 7.340261518062475e-06, "loss": 0.7921, "num_input_tokens_seen": 30001792, "step": 52015 }, { "epoch": 7.7479892761394105, "grad_norm": 0.2838147282600403, "learning_rate": 7.335662082389972e-06, "loss": 0.7915, "num_input_tokens_seen": 30004512, "step": 52020 }, { "epoch": 7.748733988680369, "grad_norm": 0.23706550896167755, "learning_rate": 7.3310638404016005e-06, "loss": 0.8009, "num_input_tokens_seen": 30007424, "step": 52025 }, { "epoch": 7.749478701221329, "grad_norm": 0.19686192274093628, "learning_rate": 7.326466792408096e-06, "loss": 0.7885, "num_input_tokens_seen": 30010368, "step": 52030 }, { "epoch": 7.750223413762288, "grad_norm": 0.2025611847639084, "learning_rate": 7.321870938720118e-06, "loss": 0.7726, "num_input_tokens_seen": 30013568, "step": 52035 }, { "epoch": 7.750968126303247, "grad_norm": 0.25124675035476685, "learning_rate": 7.317276279648222e-06, "loss": 0.7754, "num_input_tokens_seen": 30016768, "step": 52040 }, { "epoch": 7.751712838844206, "grad_norm": 0.22406841814517975, "learning_rate": 7.3126828155029024e-06, "loss": 0.7925, "num_input_tokens_seen": 30019488, "step": 52045 }, { "epoch": 7.752457551385166, "grad_norm": 0.1941157877445221, "learning_rate": 7.308090546594565e-06, "loss": 0.8209, "num_input_tokens_seen": 30022272, "step": 52050 }, { "epoch": 7.753202263926124, "grad_norm": 0.16484691202640533, "learning_rate": 7.303499473233546e-06, "loss": 0.8186, "num_input_tokens_seen": 30025216, "step": 52055 }, { "epoch": 7.753946976467084, "grad_norm": 0.15779617428779602, "learning_rate": 7.2989095957300804e-06, "loss": 0.739, "num_input_tokens_seen": 30029024, "step": 52060 }, { "epoch": 7.754691689008043, "grad_norm": 0.2514156401157379, "learning_rate": 7.294320914394331e-06, "loss": 0.7825, "num_input_tokens_seen": 30032192, "step": 52065 }, { "epoch": 7.755436401549002, "grad_norm": 0.1885775923728943, "learning_rate": 7.289733429536391e-06, "loss": 0.8184, "num_input_tokens_seen": 30035040, "step": 52070 }, { "epoch": 7.756181114089961, "grad_norm": 0.1925196647644043, "learning_rate": 7.285147141466269e-06, "loss": 0.8323, "num_input_tokens_seen": 30037728, "step": 52075 }, { "epoch": 7.756925826630921, "grad_norm": 0.18997274339199066, "learning_rate": 7.280562050493872e-06, "loss": 0.7858, "num_input_tokens_seen": 30040672, "step": 52080 }, { "epoch": 7.75767053917188, "grad_norm": 0.21574673056602478, "learning_rate": 7.2759781569290506e-06, "loss": 0.7851, "num_input_tokens_seen": 30043712, "step": 52085 }, { "epoch": 7.758415251712838, "grad_norm": 0.2065753936767578, "learning_rate": 7.27139546108157e-06, "loss": 0.7915, "num_input_tokens_seen": 30046432, "step": 52090 }, { "epoch": 7.759159964253798, "grad_norm": 0.22279886901378632, "learning_rate": 7.266813963261099e-06, "loss": 0.7987, "num_input_tokens_seen": 30049504, "step": 52095 }, { "epoch": 7.759904676794758, "grad_norm": 0.16901470720767975, "learning_rate": 7.262233663777243e-06, "loss": 0.7851, "num_input_tokens_seen": 30052832, "step": 52100 }, { "epoch": 7.7606493893357165, "grad_norm": 0.1782180815935135, "learning_rate": 7.257654562939517e-06, "loss": 0.7989, "num_input_tokens_seen": 30055456, "step": 52105 }, { "epoch": 7.761394101876675, "grad_norm": 0.2848965525627136, "learning_rate": 7.253076661057362e-06, "loss": 0.8152, "num_input_tokens_seen": 30058208, "step": 52110 }, { "epoch": 7.762138814417635, "grad_norm": 0.12466694414615631, "learning_rate": 7.248499958440141e-06, "loss": 0.8197, "num_input_tokens_seen": 30060928, "step": 52115 }, { "epoch": 7.762883526958594, "grad_norm": 0.24359630048274994, "learning_rate": 7.243924455397111e-06, "loss": 0.8141, "num_input_tokens_seen": 30063648, "step": 52120 }, { "epoch": 7.763628239499553, "grad_norm": 0.16971449553966522, "learning_rate": 7.2393501522374844e-06, "loss": 0.7756, "num_input_tokens_seen": 30066656, "step": 52125 }, { "epoch": 7.764372952040512, "grad_norm": 0.26046034693717957, "learning_rate": 7.234777049270358e-06, "loss": 0.8268, "num_input_tokens_seen": 30069504, "step": 52130 }, { "epoch": 7.765117664581472, "grad_norm": 0.31830742955207825, "learning_rate": 7.230205146804769e-06, "loss": 0.7905, "num_input_tokens_seen": 30072512, "step": 52135 }, { "epoch": 7.76586237712243, "grad_norm": 0.18665021657943726, "learning_rate": 7.2256344451496676e-06, "loss": 0.7924, "num_input_tokens_seen": 30075360, "step": 52140 }, { "epoch": 7.76660708966339, "grad_norm": 0.28057458996772766, "learning_rate": 7.221064944613929e-06, "loss": 0.7912, "num_input_tokens_seen": 30078304, "step": 52145 }, { "epoch": 7.767351802204349, "grad_norm": 0.3474206328392029, "learning_rate": 7.2164966455063435e-06, "loss": 0.8223, "num_input_tokens_seen": 30081184, "step": 52150 }, { "epoch": 7.7680965147453085, "grad_norm": 0.1894458383321762, "learning_rate": 7.2119295481356044e-06, "loss": 0.8019, "num_input_tokens_seen": 30084000, "step": 52155 }, { "epoch": 7.768841227286267, "grad_norm": 0.16844546794891357, "learning_rate": 7.2073636528103535e-06, "loss": 0.8174, "num_input_tokens_seen": 30086944, "step": 52160 }, { "epoch": 7.769585939827227, "grad_norm": 0.2752223610877991, "learning_rate": 7.202798959839119e-06, "loss": 0.8176, "num_input_tokens_seen": 30090048, "step": 52165 }, { "epoch": 7.770330652368186, "grad_norm": 0.19943980872631073, "learning_rate": 7.198235469530374e-06, "loss": 0.7768, "num_input_tokens_seen": 30093088, "step": 52170 }, { "epoch": 7.771075364909145, "grad_norm": 0.15857551991939545, "learning_rate": 7.193673182192498e-06, "loss": 0.8317, "num_input_tokens_seen": 30096128, "step": 52175 }, { "epoch": 7.771820077450104, "grad_norm": 0.2806297838687897, "learning_rate": 7.189112098133793e-06, "loss": 0.817, "num_input_tokens_seen": 30099040, "step": 52180 }, { "epoch": 7.772564789991064, "grad_norm": 0.24600015580654144, "learning_rate": 7.184552217662488e-06, "loss": 0.813, "num_input_tokens_seen": 30101888, "step": 52185 }, { "epoch": 7.7733095025320225, "grad_norm": 0.18656882643699646, "learning_rate": 7.179993541086702e-06, "loss": 0.8056, "num_input_tokens_seen": 30104640, "step": 52190 }, { "epoch": 7.774054215072982, "grad_norm": 0.2905525267124176, "learning_rate": 7.175436068714503e-06, "loss": 0.771, "num_input_tokens_seen": 30108096, "step": 52195 }, { "epoch": 7.774798927613941, "grad_norm": 0.16074629127979279, "learning_rate": 7.170879800853872e-06, "loss": 0.817, "num_input_tokens_seen": 30110912, "step": 52200 }, { "epoch": 7.7755436401549005, "grad_norm": 0.17694896459579468, "learning_rate": 7.166324737812688e-06, "loss": 0.8025, "num_input_tokens_seen": 30113536, "step": 52205 }, { "epoch": 7.776288352695859, "grad_norm": 0.20324935019016266, "learning_rate": 7.161770879898771e-06, "loss": 0.8077, "num_input_tokens_seen": 30116672, "step": 52210 }, { "epoch": 7.777033065236819, "grad_norm": 0.25407180190086365, "learning_rate": 7.1572182274198564e-06, "loss": 0.8198, "num_input_tokens_seen": 30119648, "step": 52215 }, { "epoch": 7.777777777777778, "grad_norm": 0.2564597427845001, "learning_rate": 7.152666780683595e-06, "loss": 0.8195, "num_input_tokens_seen": 30122560, "step": 52220 }, { "epoch": 7.778522490318737, "grad_norm": 0.1766975075006485, "learning_rate": 7.148116539997546e-06, "loss": 0.819, "num_input_tokens_seen": 30125568, "step": 52225 }, { "epoch": 7.779267202859696, "grad_norm": 0.2843279540538788, "learning_rate": 7.143567505669199e-06, "loss": 0.8032, "num_input_tokens_seen": 30128864, "step": 52230 }, { "epoch": 7.780011915400656, "grad_norm": 0.21462157368659973, "learning_rate": 7.139019678005959e-06, "loss": 0.8148, "num_input_tokens_seen": 30131744, "step": 52235 }, { "epoch": 7.7807566279416145, "grad_norm": 0.21561108529567719, "learning_rate": 7.134473057315163e-06, "loss": 0.8062, "num_input_tokens_seen": 30134432, "step": 52240 }, { "epoch": 7.781501340482574, "grad_norm": 0.17996035516262054, "learning_rate": 7.129927643904033e-06, "loss": 0.7848, "num_input_tokens_seen": 30137248, "step": 52245 }, { "epoch": 7.782246053023533, "grad_norm": 0.2875627279281616, "learning_rate": 7.125383438079736e-06, "loss": 0.7966, "num_input_tokens_seen": 30140096, "step": 52250 }, { "epoch": 7.782990765564492, "grad_norm": 0.22496721148490906, "learning_rate": 7.120840440149365e-06, "loss": 0.7933, "num_input_tokens_seen": 30142784, "step": 52255 }, { "epoch": 7.783735478105451, "grad_norm": 0.23311282694339752, "learning_rate": 7.1162986504198945e-06, "loss": 0.7919, "num_input_tokens_seen": 30145568, "step": 52260 }, { "epoch": 7.784480190646411, "grad_norm": 0.263627290725708, "learning_rate": 7.1117580691982545e-06, "loss": 0.7728, "num_input_tokens_seen": 30148896, "step": 52265 }, { "epoch": 7.78522490318737, "grad_norm": 0.18816709518432617, "learning_rate": 7.107218696791273e-06, "loss": 0.7953, "num_input_tokens_seen": 30151552, "step": 52270 }, { "epoch": 7.7859696157283285, "grad_norm": 0.21250124275684357, "learning_rate": 7.102680533505707e-06, "loss": 0.7701, "num_input_tokens_seen": 30154688, "step": 52275 }, { "epoch": 7.786714328269288, "grad_norm": 0.15478651225566864, "learning_rate": 7.0981435796482306e-06, "loss": 0.8041, "num_input_tokens_seen": 30157728, "step": 52280 }, { "epoch": 7.787459040810247, "grad_norm": 0.22265629470348358, "learning_rate": 7.093607835525423e-06, "loss": 0.8189, "num_input_tokens_seen": 30160672, "step": 52285 }, { "epoch": 7.7882037533512065, "grad_norm": 0.20892104506492615, "learning_rate": 7.089073301443802e-06, "loss": 0.8138, "num_input_tokens_seen": 30163584, "step": 52290 }, { "epoch": 7.788948465892165, "grad_norm": 0.21683967113494873, "learning_rate": 7.084539977709778e-06, "loss": 0.7902, "num_input_tokens_seen": 30166368, "step": 52295 }, { "epoch": 7.789693178433125, "grad_norm": 0.19747938215732574, "learning_rate": 7.080007864629706e-06, "loss": 0.7886, "num_input_tokens_seen": 30169088, "step": 52300 }, { "epoch": 7.790437890974084, "grad_norm": 0.1839890331029892, "learning_rate": 7.075476962509845e-06, "loss": 0.806, "num_input_tokens_seen": 30171712, "step": 52305 }, { "epoch": 7.791182603515043, "grad_norm": 0.25946491956710815, "learning_rate": 7.070947271656372e-06, "loss": 0.8037, "num_input_tokens_seen": 30174720, "step": 52310 }, { "epoch": 7.791927316056002, "grad_norm": 0.24064010381698608, "learning_rate": 7.0664187923753984e-06, "loss": 0.7842, "num_input_tokens_seen": 30177632, "step": 52315 }, { "epoch": 7.792672028596962, "grad_norm": 0.33449891209602356, "learning_rate": 7.061891524972927e-06, "loss": 0.7888, "num_input_tokens_seen": 30180928, "step": 52320 }, { "epoch": 7.7934167411379205, "grad_norm": 0.31355929374694824, "learning_rate": 7.057365469754892e-06, "loss": 0.8075, "num_input_tokens_seen": 30183552, "step": 52325 }, { "epoch": 7.79416145367888, "grad_norm": 0.25412717461586, "learning_rate": 7.052840627027146e-06, "loss": 0.794, "num_input_tokens_seen": 30186432, "step": 52330 }, { "epoch": 7.794906166219839, "grad_norm": 0.27699339389801025, "learning_rate": 7.048316997095464e-06, "loss": 0.7754, "num_input_tokens_seen": 30189472, "step": 52335 }, { "epoch": 7.7956508787607985, "grad_norm": 0.19633813202381134, "learning_rate": 7.0437945802655334e-06, "loss": 0.808, "num_input_tokens_seen": 30192160, "step": 52340 }, { "epoch": 7.796395591301757, "grad_norm": 0.22488915920257568, "learning_rate": 7.039273376842958e-06, "loss": 0.8144, "num_input_tokens_seen": 30195136, "step": 52345 }, { "epoch": 7.797140303842717, "grad_norm": 0.31660425662994385, "learning_rate": 7.034753387133275e-06, "loss": 0.7799, "num_input_tokens_seen": 30198208, "step": 52350 }, { "epoch": 7.797885016383676, "grad_norm": 0.25886771082878113, "learning_rate": 7.03023461144191e-06, "loss": 0.8012, "num_input_tokens_seen": 30201088, "step": 52355 }, { "epoch": 7.798629728924635, "grad_norm": 0.2814798653125763, "learning_rate": 7.025717050074235e-06, "loss": 0.7886, "num_input_tokens_seen": 30204192, "step": 52360 }, { "epoch": 7.799374441465594, "grad_norm": 0.20942533016204834, "learning_rate": 7.021200703335518e-06, "loss": 0.7819, "num_input_tokens_seen": 30206976, "step": 52365 }, { "epoch": 7.800119154006554, "grad_norm": 0.22063882648944855, "learning_rate": 7.01668557153096e-06, "loss": 0.7581, "num_input_tokens_seen": 30209696, "step": 52370 }, { "epoch": 7.8008638665475125, "grad_norm": 0.24054278433322906, "learning_rate": 7.012171654965677e-06, "loss": 0.801, "num_input_tokens_seen": 30212256, "step": 52375 }, { "epoch": 7.801608579088472, "grad_norm": 0.20554593205451965, "learning_rate": 7.007658953944699e-06, "loss": 0.7856, "num_input_tokens_seen": 30215264, "step": 52380 }, { "epoch": 7.802353291629431, "grad_norm": 0.23502466082572937, "learning_rate": 7.003147468772986e-06, "loss": 0.8083, "num_input_tokens_seen": 30218016, "step": 52385 }, { "epoch": 7.803098004170391, "grad_norm": 0.16793696582317352, "learning_rate": 6.998637199755389e-06, "loss": 0.7991, "num_input_tokens_seen": 30220832, "step": 52390 }, { "epoch": 7.803842716711349, "grad_norm": 0.21653856337070465, "learning_rate": 6.994128147196702e-06, "loss": 0.7824, "num_input_tokens_seen": 30223616, "step": 52395 }, { "epoch": 7.804587429252309, "grad_norm": 0.3190653324127197, "learning_rate": 6.989620311401637e-06, "loss": 0.8193, "num_input_tokens_seen": 30226464, "step": 52400 }, { "epoch": 7.805332141793268, "grad_norm": 0.22716104984283447, "learning_rate": 6.985113692674797e-06, "loss": 0.7974, "num_input_tokens_seen": 30229248, "step": 52405 }, { "epoch": 7.806076854334227, "grad_norm": 0.20441822707653046, "learning_rate": 6.980608291320731e-06, "loss": 0.8001, "num_input_tokens_seen": 30232192, "step": 52410 }, { "epoch": 7.806821566875186, "grad_norm": 0.2508077919483185, "learning_rate": 6.976104107643896e-06, "loss": 0.8117, "num_input_tokens_seen": 30235008, "step": 52415 }, { "epoch": 7.807566279416145, "grad_norm": 0.16365648806095123, "learning_rate": 6.9716011419486745e-06, "loss": 0.7702, "num_input_tokens_seen": 30237792, "step": 52420 }, { "epoch": 7.8083109919571045, "grad_norm": 0.24098509550094604, "learning_rate": 6.96709939453934e-06, "loss": 0.7955, "num_input_tokens_seen": 30240928, "step": 52425 }, { "epoch": 7.809055704498064, "grad_norm": 0.2821339964866638, "learning_rate": 6.962598865720113e-06, "loss": 0.7809, "num_input_tokens_seen": 30243776, "step": 52430 }, { "epoch": 7.809800417039023, "grad_norm": 0.28255507349967957, "learning_rate": 6.95809955579512e-06, "loss": 0.8061, "num_input_tokens_seen": 30246528, "step": 52435 }, { "epoch": 7.810545129579982, "grad_norm": 0.23568561673164368, "learning_rate": 6.95360146506841e-06, "loss": 0.783, "num_input_tokens_seen": 30249536, "step": 52440 }, { "epoch": 7.811289842120941, "grad_norm": 0.21244265139102936, "learning_rate": 6.949104593843939e-06, "loss": 0.7948, "num_input_tokens_seen": 30252576, "step": 52445 }, { "epoch": 7.812034554661901, "grad_norm": 0.26159635186195374, "learning_rate": 6.9446089424255875e-06, "loss": 0.7835, "num_input_tokens_seen": 30255488, "step": 52450 }, { "epoch": 7.81277926720286, "grad_norm": 0.23778629302978516, "learning_rate": 6.940114511117163e-06, "loss": 0.7672, "num_input_tokens_seen": 30258784, "step": 52455 }, { "epoch": 7.8135239797438185, "grad_norm": 0.17748518288135529, "learning_rate": 6.935621300222367e-06, "loss": 0.7821, "num_input_tokens_seen": 30261376, "step": 52460 }, { "epoch": 7.814268692284778, "grad_norm": 0.19527114927768707, "learning_rate": 6.93112931004484e-06, "loss": 0.7579, "num_input_tokens_seen": 30264224, "step": 52465 }, { "epoch": 7.815013404825737, "grad_norm": 0.25246095657348633, "learning_rate": 6.9266385408881305e-06, "loss": 0.8431, "num_input_tokens_seen": 30267136, "step": 52470 }, { "epoch": 7.815758117366697, "grad_norm": 0.17696848511695862, "learning_rate": 6.922148993055708e-06, "loss": 0.7885, "num_input_tokens_seen": 30269920, "step": 52475 }, { "epoch": 7.816502829907655, "grad_norm": 0.2778346836566925, "learning_rate": 6.917660666850964e-06, "loss": 0.7828, "num_input_tokens_seen": 30272800, "step": 52480 }, { "epoch": 7.817247542448615, "grad_norm": 0.22207261621952057, "learning_rate": 6.913173562577193e-06, "loss": 0.8137, "num_input_tokens_seen": 30275680, "step": 52485 }, { "epoch": 7.817992254989574, "grad_norm": 0.2470126748085022, "learning_rate": 6.908687680537615e-06, "loss": 0.8271, "num_input_tokens_seen": 30278656, "step": 52490 }, { "epoch": 7.818736967530533, "grad_norm": 0.31029361486434937, "learning_rate": 6.904203021035366e-06, "loss": 0.7699, "num_input_tokens_seen": 30281632, "step": 52495 }, { "epoch": 7.819481680071492, "grad_norm": 0.18358829617500305, "learning_rate": 6.899719584373504e-06, "loss": 0.7733, "num_input_tokens_seen": 30284416, "step": 52500 }, { "epoch": 7.820226392612452, "grad_norm": 0.23111747205257416, "learning_rate": 6.895237370855004e-06, "loss": 0.8227, "num_input_tokens_seen": 30287392, "step": 52505 }, { "epoch": 7.8209711051534105, "grad_norm": 0.3099401295185089, "learning_rate": 6.890756380782751e-06, "loss": 0.8026, "num_input_tokens_seen": 30290368, "step": 52510 }, { "epoch": 7.82171581769437, "grad_norm": 0.184278205037117, "learning_rate": 6.886276614459567e-06, "loss": 0.7871, "num_input_tokens_seen": 30293216, "step": 52515 }, { "epoch": 7.822460530235329, "grad_norm": 0.22087667882442474, "learning_rate": 6.881798072188159e-06, "loss": 0.8154, "num_input_tokens_seen": 30296224, "step": 52520 }, { "epoch": 7.823205242776289, "grad_norm": 0.23046660423278809, "learning_rate": 6.8773207542711716e-06, "loss": 0.7889, "num_input_tokens_seen": 30299232, "step": 52525 }, { "epoch": 7.823949955317247, "grad_norm": 0.22145628929138184, "learning_rate": 6.872844661011163e-06, "loss": 0.7736, "num_input_tokens_seen": 30302112, "step": 52530 }, { "epoch": 7.824694667858207, "grad_norm": 0.21710331737995148, "learning_rate": 6.868369792710613e-06, "loss": 0.7701, "num_input_tokens_seen": 30305120, "step": 52535 }, { "epoch": 7.825439380399166, "grad_norm": 0.22194066643714905, "learning_rate": 6.863896149671914e-06, "loss": 0.7866, "num_input_tokens_seen": 30307968, "step": 52540 }, { "epoch": 7.826184092940125, "grad_norm": 0.21825513243675232, "learning_rate": 6.859423732197379e-06, "loss": 0.8103, "num_input_tokens_seen": 30310752, "step": 52545 }, { "epoch": 7.826928805481084, "grad_norm": 0.21999235451221466, "learning_rate": 6.854952540589241e-06, "loss": 0.8023, "num_input_tokens_seen": 30313568, "step": 52550 }, { "epoch": 7.827673518022044, "grad_norm": 0.24322012066841125, "learning_rate": 6.850482575149631e-06, "loss": 0.8184, "num_input_tokens_seen": 30316576, "step": 52555 }, { "epoch": 7.828418230563003, "grad_norm": 0.23589017987251282, "learning_rate": 6.846013836180623e-06, "loss": 0.7974, "num_input_tokens_seen": 30319520, "step": 52560 }, { "epoch": 7.829162943103962, "grad_norm": 0.27003413438796997, "learning_rate": 6.8415463239841854e-06, "loss": 0.8081, "num_input_tokens_seen": 30322336, "step": 52565 }, { "epoch": 7.829907655644921, "grad_norm": 0.25761252641677856, "learning_rate": 6.83708003886222e-06, "loss": 0.7778, "num_input_tokens_seen": 30325184, "step": 52570 }, { "epoch": 7.830652368185881, "grad_norm": 0.21058358252048492, "learning_rate": 6.832614981116542e-06, "loss": 0.8124, "num_input_tokens_seen": 30327744, "step": 52575 }, { "epoch": 7.831397080726839, "grad_norm": 0.29032281041145325, "learning_rate": 6.8281511510488785e-06, "loss": 0.7832, "num_input_tokens_seen": 30330880, "step": 52580 }, { "epoch": 7.832141793267798, "grad_norm": 0.17000995576381683, "learning_rate": 6.8236885489608885e-06, "loss": 0.7921, "num_input_tokens_seen": 30333664, "step": 52585 }, { "epoch": 7.832886505808758, "grad_norm": 0.19929182529449463, "learning_rate": 6.819227175154117e-06, "loss": 0.815, "num_input_tokens_seen": 30336640, "step": 52590 }, { "epoch": 7.833631218349717, "grad_norm": 0.21466906368732452, "learning_rate": 6.814767029930055e-06, "loss": 0.8051, "num_input_tokens_seen": 30339456, "step": 52595 }, { "epoch": 7.834375930890676, "grad_norm": 0.22963984310626984, "learning_rate": 6.810308113590111e-06, "loss": 0.8004, "num_input_tokens_seen": 30342432, "step": 52600 }, { "epoch": 7.835120643431635, "grad_norm": 0.19894152879714966, "learning_rate": 6.805850426435581e-06, "loss": 0.8185, "num_input_tokens_seen": 30346432, "step": 52605 }, { "epoch": 7.835865355972595, "grad_norm": 0.22755500674247742, "learning_rate": 6.801393968767708e-06, "loss": 0.7891, "num_input_tokens_seen": 30349344, "step": 52610 }, { "epoch": 7.836610068513554, "grad_norm": 0.22874346375465393, "learning_rate": 6.796938740887643e-06, "loss": 0.7926, "num_input_tokens_seen": 30352032, "step": 52615 }, { "epoch": 7.837354781054513, "grad_norm": 0.2280908077955246, "learning_rate": 6.792484743096456e-06, "loss": 0.7673, "num_input_tokens_seen": 30354784, "step": 52620 }, { "epoch": 7.838099493595472, "grad_norm": 0.29437848925590515, "learning_rate": 6.788031975695114e-06, "loss": 0.82, "num_input_tokens_seen": 30358208, "step": 52625 }, { "epoch": 7.838844206136431, "grad_norm": 0.18760435283184052, "learning_rate": 6.783580438984527e-06, "loss": 0.8249, "num_input_tokens_seen": 30360928, "step": 52630 }, { "epoch": 7.83958891867739, "grad_norm": 0.29904234409332275, "learning_rate": 6.779130133265513e-06, "loss": 0.8126, "num_input_tokens_seen": 30363392, "step": 52635 }, { "epoch": 7.84033363121835, "grad_norm": 0.20362766087055206, "learning_rate": 6.774681058838811e-06, "loss": 0.833, "num_input_tokens_seen": 30366144, "step": 52640 }, { "epoch": 7.841078343759309, "grad_norm": 0.2859119772911072, "learning_rate": 6.770233216005056e-06, "loss": 0.8138, "num_input_tokens_seen": 30369280, "step": 52645 }, { "epoch": 7.841823056300268, "grad_norm": 0.23706206679344177, "learning_rate": 6.76578660506483e-06, "loss": 0.8023, "num_input_tokens_seen": 30372160, "step": 52650 }, { "epoch": 7.842567768841227, "grad_norm": 0.18703433871269226, "learning_rate": 6.7613412263186074e-06, "loss": 0.7908, "num_input_tokens_seen": 30375328, "step": 52655 }, { "epoch": 7.843312481382187, "grad_norm": 0.27031442523002625, "learning_rate": 6.756897080066788e-06, "loss": 0.7746, "num_input_tokens_seen": 30377920, "step": 52660 }, { "epoch": 7.844057193923145, "grad_norm": 0.17875775694847107, "learning_rate": 6.752454166609693e-06, "loss": 0.8198, "num_input_tokens_seen": 30380768, "step": 52665 }, { "epoch": 7.844801906464105, "grad_norm": 0.19259220361709595, "learning_rate": 6.748012486247557e-06, "loss": 0.7862, "num_input_tokens_seen": 30383424, "step": 52670 }, { "epoch": 7.845546619005064, "grad_norm": 0.20363321900367737, "learning_rate": 6.74357203928053e-06, "loss": 0.8283, "num_input_tokens_seen": 30386368, "step": 52675 }, { "epoch": 7.846291331546023, "grad_norm": 0.24529466032981873, "learning_rate": 6.7391328260086845e-06, "loss": 0.7934, "num_input_tokens_seen": 30389920, "step": 52680 }, { "epoch": 7.847036044086982, "grad_norm": 0.18236730992794037, "learning_rate": 6.7346948467320036e-06, "loss": 0.7635, "num_input_tokens_seen": 30392640, "step": 52685 }, { "epoch": 7.847780756627942, "grad_norm": 0.20752458274364471, "learning_rate": 6.730258101750372e-06, "loss": 0.8033, "num_input_tokens_seen": 30395264, "step": 52690 }, { "epoch": 7.848525469168901, "grad_norm": 0.2192007601261139, "learning_rate": 6.725822591363621e-06, "loss": 0.8042, "num_input_tokens_seen": 30398272, "step": 52695 }, { "epoch": 7.84927018170986, "grad_norm": 0.16940540075302124, "learning_rate": 6.721388315871482e-06, "loss": 0.8386, "num_input_tokens_seen": 30401184, "step": 52700 }, { "epoch": 7.850014894250819, "grad_norm": 0.2751452624797821, "learning_rate": 6.7169552755736055e-06, "loss": 0.7601, "num_input_tokens_seen": 30403744, "step": 52705 }, { "epoch": 7.850759606791779, "grad_norm": 0.35673007369041443, "learning_rate": 6.712523470769555e-06, "loss": 0.7887, "num_input_tokens_seen": 30407008, "step": 52710 }, { "epoch": 7.851504319332737, "grad_norm": 0.2230808287858963, "learning_rate": 6.708092901758828e-06, "loss": 0.81, "num_input_tokens_seen": 30409920, "step": 52715 }, { "epoch": 7.852249031873697, "grad_norm": 0.22626827657222748, "learning_rate": 6.703663568840804e-06, "loss": 0.7923, "num_input_tokens_seen": 30412544, "step": 52720 }, { "epoch": 7.852993744414656, "grad_norm": 0.18463736772537231, "learning_rate": 6.699235472314816e-06, "loss": 0.7902, "num_input_tokens_seen": 30415872, "step": 52725 }, { "epoch": 7.8537384569556155, "grad_norm": 0.3083188831806183, "learning_rate": 6.694808612480083e-06, "loss": 0.8329, "num_input_tokens_seen": 30419328, "step": 52730 }, { "epoch": 7.854483169496574, "grad_norm": 0.17854125797748566, "learning_rate": 6.6903829896357604e-06, "loss": 0.7962, "num_input_tokens_seen": 30422080, "step": 52735 }, { "epoch": 7.855227882037534, "grad_norm": 0.17962327599525452, "learning_rate": 6.6859586040809105e-06, "loss": 0.7766, "num_input_tokens_seen": 30424864, "step": 52740 }, { "epoch": 7.855972594578493, "grad_norm": 0.2990778982639313, "learning_rate": 6.681535456114521e-06, "loss": 0.8121, "num_input_tokens_seen": 30427680, "step": 52745 }, { "epoch": 7.856717307119452, "grad_norm": 0.24735113978385925, "learning_rate": 6.677113546035496e-06, "loss": 0.778, "num_input_tokens_seen": 30430592, "step": 52750 }, { "epoch": 7.857462019660411, "grad_norm": 0.25030893087387085, "learning_rate": 6.672692874142636e-06, "loss": 0.7865, "num_input_tokens_seen": 30433344, "step": 52755 }, { "epoch": 7.858206732201371, "grad_norm": 0.25901004672050476, "learning_rate": 6.668273440734676e-06, "loss": 0.831, "num_input_tokens_seen": 30436128, "step": 52760 }, { "epoch": 7.858951444742329, "grad_norm": 0.2587563097476959, "learning_rate": 6.663855246110273e-06, "loss": 0.8097, "num_input_tokens_seen": 30439200, "step": 52765 }, { "epoch": 7.859696157283288, "grad_norm": 0.26076260209083557, "learning_rate": 6.659438290567976e-06, "loss": 0.8444, "num_input_tokens_seen": 30442112, "step": 52770 }, { "epoch": 7.860440869824248, "grad_norm": 0.23170536756515503, "learning_rate": 6.655022574406272e-06, "loss": 0.8213, "num_input_tokens_seen": 30444640, "step": 52775 }, { "epoch": 7.8611855823652075, "grad_norm": 0.29374125599861145, "learning_rate": 6.650608097923558e-06, "loss": 0.7817, "num_input_tokens_seen": 30447680, "step": 52780 }, { "epoch": 7.861930294906166, "grad_norm": 0.1804036647081375, "learning_rate": 6.64619486141815e-06, "loss": 0.7863, "num_input_tokens_seen": 30450432, "step": 52785 }, { "epoch": 7.862675007447125, "grad_norm": 0.23303931951522827, "learning_rate": 6.641782865188267e-06, "loss": 0.7854, "num_input_tokens_seen": 30453248, "step": 52790 }, { "epoch": 7.863419719988085, "grad_norm": 0.21982352435588837, "learning_rate": 6.637372109532061e-06, "loss": 0.7733, "num_input_tokens_seen": 30456064, "step": 52795 }, { "epoch": 7.864164432529043, "grad_norm": 0.21491073071956635, "learning_rate": 6.632962594747588e-06, "loss": 0.8256, "num_input_tokens_seen": 30459136, "step": 52800 }, { "epoch": 7.864909145070003, "grad_norm": 0.2322845607995987, "learning_rate": 6.628554321132835e-06, "loss": 0.7932, "num_input_tokens_seen": 30461888, "step": 52805 }, { "epoch": 7.865653857610962, "grad_norm": 0.19949185848236084, "learning_rate": 6.624147288985682e-06, "loss": 0.779, "num_input_tokens_seen": 30464704, "step": 52810 }, { "epoch": 7.8663985701519215, "grad_norm": 0.20051638782024384, "learning_rate": 6.619741498603951e-06, "loss": 0.7782, "num_input_tokens_seen": 30467392, "step": 52815 }, { "epoch": 7.86714328269288, "grad_norm": 0.18176278471946716, "learning_rate": 6.615336950285356e-06, "loss": 0.793, "num_input_tokens_seen": 30470400, "step": 52820 }, { "epoch": 7.86788799523384, "grad_norm": 0.2386542111635208, "learning_rate": 6.610933644327541e-06, "loss": 0.8462, "num_input_tokens_seen": 30473248, "step": 52825 }, { "epoch": 7.868632707774799, "grad_norm": 0.25450918078422546, "learning_rate": 6.606531581028067e-06, "loss": 0.789, "num_input_tokens_seen": 30476160, "step": 52830 }, { "epoch": 7.869377420315758, "grad_norm": 0.17044569551944733, "learning_rate": 6.602130760684405e-06, "loss": 0.8005, "num_input_tokens_seen": 30479648, "step": 52835 }, { "epoch": 7.870122132856717, "grad_norm": 0.21184241771697998, "learning_rate": 6.597731183593947e-06, "loss": 0.7643, "num_input_tokens_seen": 30482400, "step": 52840 }, { "epoch": 7.870866845397677, "grad_norm": 0.2505221962928772, "learning_rate": 6.593332850054004e-06, "loss": 0.8103, "num_input_tokens_seen": 30485312, "step": 52845 }, { "epoch": 7.871611557938635, "grad_norm": 0.26169028878211975, "learning_rate": 6.588935760361789e-06, "loss": 0.787, "num_input_tokens_seen": 30488448, "step": 52850 }, { "epoch": 7.872356270479595, "grad_norm": 0.44207456707954407, "learning_rate": 6.584539914814439e-06, "loss": 0.7908, "num_input_tokens_seen": 30491456, "step": 52855 }, { "epoch": 7.873100983020554, "grad_norm": 0.2932758033275604, "learning_rate": 6.580145313709005e-06, "loss": 0.8157, "num_input_tokens_seen": 30494432, "step": 52860 }, { "epoch": 7.8738456955615135, "grad_norm": 0.20484183728694916, "learning_rate": 6.575751957342463e-06, "loss": 0.7853, "num_input_tokens_seen": 30497504, "step": 52865 }, { "epoch": 7.874590408102472, "grad_norm": 0.3275396525859833, "learning_rate": 6.571359846011696e-06, "loss": 0.8293, "num_input_tokens_seen": 30500320, "step": 52870 }, { "epoch": 7.875335120643432, "grad_norm": 0.22587600350379944, "learning_rate": 6.566968980013505e-06, "loss": 0.804, "num_input_tokens_seen": 30503264, "step": 52875 }, { "epoch": 7.876079833184391, "grad_norm": 0.24991101026535034, "learning_rate": 6.5625793596446165e-06, "loss": 0.8225, "num_input_tokens_seen": 30506368, "step": 52880 }, { "epoch": 7.87682454572535, "grad_norm": 0.23239479959011078, "learning_rate": 6.558190985201651e-06, "loss": 0.7863, "num_input_tokens_seen": 30509248, "step": 52885 }, { "epoch": 7.877569258266309, "grad_norm": 0.2391650229692459, "learning_rate": 6.553803856981152e-06, "loss": 0.7866, "num_input_tokens_seen": 30511904, "step": 52890 }, { "epoch": 7.878313970807269, "grad_norm": 0.1906706988811493, "learning_rate": 6.549417975279595e-06, "loss": 0.7895, "num_input_tokens_seen": 30514816, "step": 52895 }, { "epoch": 7.8790586833482275, "grad_norm": 0.22093041241168976, "learning_rate": 6.545033340393356e-06, "loss": 0.8186, "num_input_tokens_seen": 30517728, "step": 52900 }, { "epoch": 7.879803395889187, "grad_norm": 0.19199086725711823, "learning_rate": 6.540649952618727e-06, "loss": 0.8433, "num_input_tokens_seen": 30520512, "step": 52905 }, { "epoch": 7.880548108430146, "grad_norm": 0.24673810601234436, "learning_rate": 6.536267812251928e-06, "loss": 0.8034, "num_input_tokens_seen": 30523360, "step": 52910 }, { "epoch": 7.8812928209711055, "grad_norm": 0.21954552829265594, "learning_rate": 6.531886919589089e-06, "loss": 0.8173, "num_input_tokens_seen": 30526368, "step": 52915 }, { "epoch": 7.882037533512064, "grad_norm": 0.2167195975780487, "learning_rate": 6.5275072749262395e-06, "loss": 0.8305, "num_input_tokens_seen": 30529600, "step": 52920 }, { "epoch": 7.882782246053024, "grad_norm": 0.20478203892707825, "learning_rate": 6.523128878559351e-06, "loss": 0.8082, "num_input_tokens_seen": 30532640, "step": 52925 }, { "epoch": 7.883526958593983, "grad_norm": 0.1951398253440857, "learning_rate": 6.518751730784284e-06, "loss": 0.7698, "num_input_tokens_seen": 30535488, "step": 52930 }, { "epoch": 7.884271671134941, "grad_norm": 0.2807459831237793, "learning_rate": 6.514375831896835e-06, "loss": 0.8042, "num_input_tokens_seen": 30538592, "step": 52935 }, { "epoch": 7.885016383675901, "grad_norm": 0.27703723311424255, "learning_rate": 6.51000118219271e-06, "loss": 0.7806, "num_input_tokens_seen": 30541696, "step": 52940 }, { "epoch": 7.885761096216861, "grad_norm": 0.2997444272041321, "learning_rate": 6.505627781967533e-06, "loss": 0.8051, "num_input_tokens_seen": 30544416, "step": 52945 }, { "epoch": 7.8865058087578195, "grad_norm": 0.2645801901817322, "learning_rate": 6.501255631516842e-06, "loss": 0.7995, "num_input_tokens_seen": 30547040, "step": 52950 }, { "epoch": 7.887250521298778, "grad_norm": 0.21629510819911957, "learning_rate": 6.4968847311360794e-06, "loss": 0.8139, "num_input_tokens_seen": 30549760, "step": 52955 }, { "epoch": 7.887995233839738, "grad_norm": 0.24920466542243958, "learning_rate": 6.4925150811206176e-06, "loss": 0.8305, "num_input_tokens_seen": 30552384, "step": 52960 }, { "epoch": 7.8887399463806975, "grad_norm": 0.19792476296424866, "learning_rate": 6.48814668176575e-06, "loss": 0.8188, "num_input_tokens_seen": 30555136, "step": 52965 }, { "epoch": 7.889484658921656, "grad_norm": 0.3016892075538635, "learning_rate": 6.483779533366654e-06, "loss": 0.8002, "num_input_tokens_seen": 30558496, "step": 52970 }, { "epoch": 7.890229371462615, "grad_norm": 0.21133236587047577, "learning_rate": 6.479413636218459e-06, "loss": 0.7867, "num_input_tokens_seen": 30561376, "step": 52975 }, { "epoch": 7.890974084003575, "grad_norm": 0.19291242957115173, "learning_rate": 6.4750489906162e-06, "loss": 0.7942, "num_input_tokens_seen": 30564128, "step": 52980 }, { "epoch": 7.8917187965445335, "grad_norm": 0.21443940699100494, "learning_rate": 6.470685596854803e-06, "loss": 0.8071, "num_input_tokens_seen": 30567008, "step": 52985 }, { "epoch": 7.892463509085493, "grad_norm": 0.17663301527500153, "learning_rate": 6.46632345522914e-06, "loss": 0.8234, "num_input_tokens_seen": 30569856, "step": 52990 }, { "epoch": 7.893208221626452, "grad_norm": 0.23967793583869934, "learning_rate": 6.461962566033986e-06, "loss": 0.8317, "num_input_tokens_seen": 30572512, "step": 52995 }, { "epoch": 7.8939529341674115, "grad_norm": 0.2637522518634796, "learning_rate": 6.45760292956403e-06, "loss": 0.8103, "num_input_tokens_seen": 30575328, "step": 53000 }, { "epoch": 7.89469764670837, "grad_norm": 0.29450544714927673, "learning_rate": 6.45324454611389e-06, "loss": 0.779, "num_input_tokens_seen": 30578144, "step": 53005 }, { "epoch": 7.89544235924933, "grad_norm": 0.1670936793088913, "learning_rate": 6.448887415978069e-06, "loss": 0.8209, "num_input_tokens_seen": 30581184, "step": 53010 }, { "epoch": 7.896187071790289, "grad_norm": 0.26809418201446533, "learning_rate": 6.4445315394510205e-06, "loss": 0.808, "num_input_tokens_seen": 30584160, "step": 53015 }, { "epoch": 7.896931784331248, "grad_norm": 0.2220771759748459, "learning_rate": 6.440176916827081e-06, "loss": 0.7931, "num_input_tokens_seen": 30586880, "step": 53020 }, { "epoch": 7.897676496872207, "grad_norm": 0.2238263338804245, "learning_rate": 6.435823548400529e-06, "loss": 0.825, "num_input_tokens_seen": 30589472, "step": 53025 }, { "epoch": 7.898421209413167, "grad_norm": 0.30133792757987976, "learning_rate": 6.431471434465544e-06, "loss": 0.7948, "num_input_tokens_seen": 30592512, "step": 53030 }, { "epoch": 7.8991659219541255, "grad_norm": 0.21440313756465912, "learning_rate": 6.427120575316226e-06, "loss": 0.7984, "num_input_tokens_seen": 30595328, "step": 53035 }, { "epoch": 7.899910634495085, "grad_norm": 0.20350885391235352, "learning_rate": 6.422770971246586e-06, "loss": 0.773, "num_input_tokens_seen": 30598336, "step": 53040 }, { "epoch": 7.900655347036044, "grad_norm": 0.1756296157836914, "learning_rate": 6.4184226225505625e-06, "loss": 0.8057, "num_input_tokens_seen": 30601184, "step": 53045 }, { "epoch": 7.9014000595770035, "grad_norm": 0.18585409224033356, "learning_rate": 6.414075529521993e-06, "loss": 0.8093, "num_input_tokens_seen": 30604128, "step": 53050 }, { "epoch": 7.902144772117962, "grad_norm": 0.26183468103408813, "learning_rate": 6.409729692454625e-06, "loss": 0.7915, "num_input_tokens_seen": 30606880, "step": 53055 }, { "epoch": 7.902889484658922, "grad_norm": 0.15292678773403168, "learning_rate": 6.4053851116421395e-06, "loss": 0.7929, "num_input_tokens_seen": 30609888, "step": 53060 }, { "epoch": 7.903634197199881, "grad_norm": 0.25379204750061035, "learning_rate": 6.401041787378131e-06, "loss": 0.7926, "num_input_tokens_seen": 30612768, "step": 53065 }, { "epoch": 7.90437890974084, "grad_norm": 0.19888202846050262, "learning_rate": 6.396699719956101e-06, "loss": 0.8049, "num_input_tokens_seen": 30615584, "step": 53070 }, { "epoch": 7.905123622281799, "grad_norm": 0.2260242998600006, "learning_rate": 6.3923589096694685e-06, "loss": 0.8189, "num_input_tokens_seen": 30618464, "step": 53075 }, { "epoch": 7.905868334822759, "grad_norm": 0.16984660923480988, "learning_rate": 6.388019356811573e-06, "loss": 0.8003, "num_input_tokens_seen": 30621536, "step": 53080 }, { "epoch": 7.9066130473637175, "grad_norm": 0.2928066849708557, "learning_rate": 6.3836810616756614e-06, "loss": 0.7578, "num_input_tokens_seen": 30624448, "step": 53085 }, { "epoch": 7.907357759904677, "grad_norm": 0.2275884598493576, "learning_rate": 6.379344024554884e-06, "loss": 0.8294, "num_input_tokens_seen": 30627424, "step": 53090 }, { "epoch": 7.908102472445636, "grad_norm": 0.23459243774414062, "learning_rate": 6.375008245742334e-06, "loss": 0.7663, "num_input_tokens_seen": 30630080, "step": 53095 }, { "epoch": 7.908847184986596, "grad_norm": 0.21033066511154175, "learning_rate": 6.370673725531004e-06, "loss": 0.798, "num_input_tokens_seen": 30632928, "step": 53100 }, { "epoch": 7.909591897527554, "grad_norm": 0.20868058502674103, "learning_rate": 6.366340464213799e-06, "loss": 0.7976, "num_input_tokens_seen": 30635712, "step": 53105 }, { "epoch": 7.910336610068514, "grad_norm": 0.37770920991897583, "learning_rate": 6.3620084620835494e-06, "loss": 0.831, "num_input_tokens_seen": 30638528, "step": 53110 }, { "epoch": 7.911081322609473, "grad_norm": 0.18836651742458344, "learning_rate": 6.357677719432998e-06, "loss": 0.786, "num_input_tokens_seen": 30641408, "step": 53115 }, { "epoch": 7.9118260351504315, "grad_norm": 0.23841296136379242, "learning_rate": 6.353348236554784e-06, "loss": 0.7939, "num_input_tokens_seen": 30644320, "step": 53120 }, { "epoch": 7.912570747691391, "grad_norm": 0.2065167874097824, "learning_rate": 6.349020013741491e-06, "loss": 0.8077, "num_input_tokens_seen": 30647200, "step": 53125 }, { "epoch": 7.913315460232351, "grad_norm": 0.20906715095043182, "learning_rate": 6.3446930512855914e-06, "loss": 0.8165, "num_input_tokens_seen": 30649984, "step": 53130 }, { "epoch": 7.9140601727733095, "grad_norm": 0.23799388110637665, "learning_rate": 6.340367349479487e-06, "loss": 0.8129, "num_input_tokens_seen": 30652896, "step": 53135 }, { "epoch": 7.914804885314268, "grad_norm": 0.20537827908992767, "learning_rate": 6.336042908615492e-06, "loss": 0.8121, "num_input_tokens_seen": 30655776, "step": 53140 }, { "epoch": 7.915549597855228, "grad_norm": 0.24320025742053986, "learning_rate": 6.331719728985844e-06, "loss": 0.822, "num_input_tokens_seen": 30658560, "step": 53145 }, { "epoch": 7.916294310396187, "grad_norm": 0.1681012362241745, "learning_rate": 6.3273978108826685e-06, "loss": 0.8123, "num_input_tokens_seen": 30661344, "step": 53150 }, { "epoch": 7.917039022937146, "grad_norm": 0.22022400796413422, "learning_rate": 6.323077154598031e-06, "loss": 0.7898, "num_input_tokens_seen": 30663968, "step": 53155 }, { "epoch": 7.917783735478105, "grad_norm": 0.17803114652633667, "learning_rate": 6.3187577604239074e-06, "loss": 0.8083, "num_input_tokens_seen": 30666880, "step": 53160 }, { "epoch": 7.918528448019065, "grad_norm": 0.2219811975955963, "learning_rate": 6.314439628652186e-06, "loss": 0.8085, "num_input_tokens_seen": 30669632, "step": 53165 }, { "epoch": 7.9192731605600235, "grad_norm": 0.19839824736118317, "learning_rate": 6.31012275957466e-06, "loss": 0.8005, "num_input_tokens_seen": 30672768, "step": 53170 }, { "epoch": 7.920017873100983, "grad_norm": 0.21786490082740784, "learning_rate": 6.305807153483048e-06, "loss": 0.8293, "num_input_tokens_seen": 30675648, "step": 53175 }, { "epoch": 7.920762585641942, "grad_norm": 0.18764615058898926, "learning_rate": 6.3014928106689905e-06, "loss": 0.7942, "num_input_tokens_seen": 30678528, "step": 53180 }, { "epoch": 7.921507298182902, "grad_norm": 0.17193067073822021, "learning_rate": 6.297179731424022e-06, "loss": 0.7913, "num_input_tokens_seen": 30681184, "step": 53185 }, { "epoch": 7.92225201072386, "grad_norm": 0.23692409694194794, "learning_rate": 6.292867916039605e-06, "loss": 0.8192, "num_input_tokens_seen": 30684192, "step": 53190 }, { "epoch": 7.92299672326482, "grad_norm": 0.28165844082832336, "learning_rate": 6.288557364807118e-06, "loss": 0.813, "num_input_tokens_seen": 30687424, "step": 53195 }, { "epoch": 7.923741435805779, "grad_norm": 0.2716763913631439, "learning_rate": 6.284248078017846e-06, "loss": 0.8337, "num_input_tokens_seen": 30690240, "step": 53200 }, { "epoch": 7.924486148346738, "grad_norm": 0.21586336195468903, "learning_rate": 6.279940055963007e-06, "loss": 0.8084, "num_input_tokens_seen": 30693088, "step": 53205 }, { "epoch": 7.925230860887697, "grad_norm": 0.19522498548030853, "learning_rate": 6.2756332989337005e-06, "loss": 0.8001, "num_input_tokens_seen": 30695840, "step": 53210 }, { "epoch": 7.925975573428657, "grad_norm": 0.19898170232772827, "learning_rate": 6.271327807220975e-06, "loss": 0.7966, "num_input_tokens_seen": 30698720, "step": 53215 }, { "epoch": 7.9267202859696155, "grad_norm": 0.28346049785614014, "learning_rate": 6.267023581115763e-06, "loss": 0.8207, "num_input_tokens_seen": 30701696, "step": 53220 }, { "epoch": 7.927464998510575, "grad_norm": 0.1989450752735138, "learning_rate": 6.262720620908935e-06, "loss": 0.7902, "num_input_tokens_seen": 30704480, "step": 53225 }, { "epoch": 7.928209711051534, "grad_norm": 0.20045626163482666, "learning_rate": 6.258418926891269e-06, "loss": 0.8098, "num_input_tokens_seen": 30707392, "step": 53230 }, { "epoch": 7.928954423592494, "grad_norm": 0.2757991850376129, "learning_rate": 6.254118499353451e-06, "loss": 0.7768, "num_input_tokens_seen": 30710304, "step": 53235 }, { "epoch": 7.929699136133452, "grad_norm": 0.3236831724643707, "learning_rate": 6.249819338586091e-06, "loss": 0.8245, "num_input_tokens_seen": 30713184, "step": 53240 }, { "epoch": 7.930443848674412, "grad_norm": 0.21488265693187714, "learning_rate": 6.245521444879715e-06, "loss": 0.7989, "num_input_tokens_seen": 30716128, "step": 53245 }, { "epoch": 7.931188561215371, "grad_norm": 0.26266565918922424, "learning_rate": 6.241224818524749e-06, "loss": 0.8169, "num_input_tokens_seen": 30718912, "step": 53250 }, { "epoch": 7.93193327375633, "grad_norm": 0.22721277177333832, "learning_rate": 6.236929459811536e-06, "loss": 0.7907, "num_input_tokens_seen": 30721856, "step": 53255 }, { "epoch": 7.932677986297289, "grad_norm": 0.3050399720668793, "learning_rate": 6.232635369030346e-06, "loss": 0.7982, "num_input_tokens_seen": 30724960, "step": 53260 }, { "epoch": 7.933422698838249, "grad_norm": 0.2266434133052826, "learning_rate": 6.228342546471353e-06, "loss": 0.8253, "num_input_tokens_seen": 30727808, "step": 53265 }, { "epoch": 7.934167411379208, "grad_norm": 0.18990390002727509, "learning_rate": 6.224050992424652e-06, "loss": 0.7934, "num_input_tokens_seen": 30730880, "step": 53270 }, { "epoch": 7.934912123920167, "grad_norm": 0.19294588267803192, "learning_rate": 6.219760707180244e-06, "loss": 0.7878, "num_input_tokens_seen": 30733632, "step": 53275 }, { "epoch": 7.935656836461126, "grad_norm": 0.25891610980033875, "learning_rate": 6.215471691028063e-06, "loss": 0.7978, "num_input_tokens_seen": 30736448, "step": 53280 }, { "epoch": 7.936401549002085, "grad_norm": 0.2992374002933502, "learning_rate": 6.2111839442579335e-06, "loss": 0.7911, "num_input_tokens_seen": 30739392, "step": 53285 }, { "epoch": 7.937146261543044, "grad_norm": 0.29744279384613037, "learning_rate": 6.206897467159595e-06, "loss": 0.7867, "num_input_tokens_seen": 30742720, "step": 53290 }, { "epoch": 7.937890974084004, "grad_norm": 0.19162799417972565, "learning_rate": 6.20261226002272e-06, "loss": 0.7895, "num_input_tokens_seen": 30745344, "step": 53295 }, { "epoch": 7.938635686624963, "grad_norm": 0.172207772731781, "learning_rate": 6.198328323136881e-06, "loss": 0.8337, "num_input_tokens_seen": 30748128, "step": 53300 }, { "epoch": 7.9393803991659215, "grad_norm": 0.32740986347198486, "learning_rate": 6.1940456567915725e-06, "loss": 0.785, "num_input_tokens_seen": 30751264, "step": 53305 }, { "epoch": 7.940125111706881, "grad_norm": 0.21964697539806366, "learning_rate": 6.189764261276207e-06, "loss": 0.8216, "num_input_tokens_seen": 30754016, "step": 53310 }, { "epoch": 7.940869824247841, "grad_norm": 0.22519806027412415, "learning_rate": 6.185484136880088e-06, "loss": 0.7966, "num_input_tokens_seen": 30756800, "step": 53315 }, { "epoch": 7.9416145367888, "grad_norm": 0.18024401366710663, "learning_rate": 6.181205283892458e-06, "loss": 0.7789, "num_input_tokens_seen": 30759616, "step": 53320 }, { "epoch": 7.942359249329758, "grad_norm": 0.25736933946609497, "learning_rate": 6.1769277026024615e-06, "loss": 0.8131, "num_input_tokens_seen": 30762496, "step": 53325 }, { "epoch": 7.943103961870718, "grad_norm": 0.20010493695735931, "learning_rate": 6.1726513932991724e-06, "loss": 0.8017, "num_input_tokens_seen": 30765344, "step": 53330 }, { "epoch": 7.943848674411677, "grad_norm": 0.2612978518009186, "learning_rate": 6.168376356271546e-06, "loss": 0.7751, "num_input_tokens_seen": 30768160, "step": 53335 }, { "epoch": 7.944593386952636, "grad_norm": 0.2589346170425415, "learning_rate": 6.1641025918084825e-06, "loss": 0.8024, "num_input_tokens_seen": 30771328, "step": 53340 }, { "epoch": 7.945338099493595, "grad_norm": 0.15933899581432343, "learning_rate": 6.15983010019879e-06, "loss": 0.8016, "num_input_tokens_seen": 30774080, "step": 53345 }, { "epoch": 7.946082812034555, "grad_norm": 0.2281970977783203, "learning_rate": 6.155558881731174e-06, "loss": 0.7943, "num_input_tokens_seen": 30776768, "step": 53350 }, { "epoch": 7.946827524575514, "grad_norm": 0.24075327813625336, "learning_rate": 6.151288936694274e-06, "loss": 0.8176, "num_input_tokens_seen": 30779520, "step": 53355 }, { "epoch": 7.947572237116473, "grad_norm": 0.23556916415691376, "learning_rate": 6.147020265376635e-06, "loss": 0.7948, "num_input_tokens_seen": 30782464, "step": 53360 }, { "epoch": 7.948316949657432, "grad_norm": 0.29897722601890564, "learning_rate": 6.1427528680667144e-06, "loss": 0.8035, "num_input_tokens_seen": 30785248, "step": 53365 }, { "epoch": 7.949061662198392, "grad_norm": 0.26986098289489746, "learning_rate": 6.138486745052896e-06, "loss": 0.8291, "num_input_tokens_seen": 30788256, "step": 53370 }, { "epoch": 7.94980637473935, "grad_norm": 0.1601824313402176, "learning_rate": 6.134221896623449e-06, "loss": 0.7971, "num_input_tokens_seen": 30791168, "step": 53375 }, { "epoch": 7.95055108728031, "grad_norm": 0.21513554453849792, "learning_rate": 6.129958323066592e-06, "loss": 0.8089, "num_input_tokens_seen": 30794304, "step": 53380 }, { "epoch": 7.951295799821269, "grad_norm": 0.22774267196655273, "learning_rate": 6.1256960246704245e-06, "loss": 0.7916, "num_input_tokens_seen": 30797216, "step": 53385 }, { "epoch": 7.952040512362228, "grad_norm": 0.16516095399856567, "learning_rate": 6.1214350017229805e-06, "loss": 0.804, "num_input_tokens_seen": 30800064, "step": 53390 }, { "epoch": 7.952785224903187, "grad_norm": 0.3104003071784973, "learning_rate": 6.117175254512206e-06, "loss": 0.8173, "num_input_tokens_seen": 30803008, "step": 53395 }, { "epoch": 7.953529937444147, "grad_norm": 0.21823543310165405, "learning_rate": 6.1129167833259535e-06, "loss": 0.798, "num_input_tokens_seen": 30805888, "step": 53400 }, { "epoch": 7.954274649985106, "grad_norm": 0.18483024835586548, "learning_rate": 6.108659588451998e-06, "loss": 0.8259, "num_input_tokens_seen": 30809024, "step": 53405 }, { "epoch": 7.955019362526065, "grad_norm": 0.17847740650177002, "learning_rate": 6.104403670178027e-06, "loss": 0.8011, "num_input_tokens_seen": 30811936, "step": 53410 }, { "epoch": 7.955764075067024, "grad_norm": 0.30547016859054565, "learning_rate": 6.1001490287916326e-06, "loss": 0.8237, "num_input_tokens_seen": 30814784, "step": 53415 }, { "epoch": 7.956508787607984, "grad_norm": 0.2391061633825302, "learning_rate": 6.095895664580317e-06, "loss": 0.7869, "num_input_tokens_seen": 30817504, "step": 53420 }, { "epoch": 7.957253500148942, "grad_norm": 0.2744886875152588, "learning_rate": 6.0916435778315156e-06, "loss": 0.8048, "num_input_tokens_seen": 30820512, "step": 53425 }, { "epoch": 7.957998212689902, "grad_norm": 0.24796131253242493, "learning_rate": 6.087392768832567e-06, "loss": 0.798, "num_input_tokens_seen": 30823264, "step": 53430 }, { "epoch": 7.958742925230861, "grad_norm": 0.3008146584033966, "learning_rate": 6.08314323787072e-06, "loss": 0.8089, "num_input_tokens_seen": 30826112, "step": 53435 }, { "epoch": 7.9594876377718204, "grad_norm": 0.18431499600410461, "learning_rate": 6.078894985233141e-06, "loss": 0.7873, "num_input_tokens_seen": 30829216, "step": 53440 }, { "epoch": 7.960232350312779, "grad_norm": 0.21755734086036682, "learning_rate": 6.074648011206921e-06, "loss": 0.7966, "num_input_tokens_seen": 30831840, "step": 53445 }, { "epoch": 7.960977062853738, "grad_norm": 0.22136883437633514, "learning_rate": 6.070402316079043e-06, "loss": 0.7819, "num_input_tokens_seen": 30834656, "step": 53450 }, { "epoch": 7.961721775394698, "grad_norm": 0.21086707711219788, "learning_rate": 6.066157900136407e-06, "loss": 0.7972, "num_input_tokens_seen": 30837280, "step": 53455 }, { "epoch": 7.962466487935657, "grad_norm": 0.2537926137447357, "learning_rate": 6.0619147636658405e-06, "loss": 0.8024, "num_input_tokens_seen": 30840288, "step": 53460 }, { "epoch": 7.963211200476616, "grad_norm": 0.14818108081817627, "learning_rate": 6.057672906954076e-06, "loss": 0.8019, "num_input_tokens_seen": 30843328, "step": 53465 }, { "epoch": 7.963955913017575, "grad_norm": 0.30507510900497437, "learning_rate": 6.053432330287765e-06, "loss": 0.7964, "num_input_tokens_seen": 30846592, "step": 53470 }, { "epoch": 7.964700625558534, "grad_norm": 0.20824451744556427, "learning_rate": 6.049193033953474e-06, "loss": 0.7895, "num_input_tokens_seen": 30849536, "step": 53475 }, { "epoch": 7.965445338099494, "grad_norm": 0.2268793284893036, "learning_rate": 6.044955018237661e-06, "loss": 0.7741, "num_input_tokens_seen": 30852992, "step": 53480 }, { "epoch": 7.966190050640453, "grad_norm": 0.23389127850532532, "learning_rate": 6.040718283426722e-06, "loss": 0.8061, "num_input_tokens_seen": 30855872, "step": 53485 }, { "epoch": 7.966934763181412, "grad_norm": 0.1330021768808365, "learning_rate": 6.036482829806964e-06, "loss": 0.7869, "num_input_tokens_seen": 30858624, "step": 53490 }, { "epoch": 7.967679475722371, "grad_norm": 0.20325316488742828, "learning_rate": 6.032248657664591e-06, "loss": 0.8251, "num_input_tokens_seen": 30861760, "step": 53495 }, { "epoch": 7.96842418826333, "grad_norm": 0.44451719522476196, "learning_rate": 6.028015767285735e-06, "loss": 0.7831, "num_input_tokens_seen": 30864896, "step": 53500 }, { "epoch": 7.96916890080429, "grad_norm": 0.24833561480045319, "learning_rate": 6.023784158956442e-06, "loss": 0.8075, "num_input_tokens_seen": 30867712, "step": 53505 }, { "epoch": 7.969913613345248, "grad_norm": 0.18127602338790894, "learning_rate": 6.019553832962668e-06, "loss": 0.8091, "num_input_tokens_seen": 30870336, "step": 53510 }, { "epoch": 7.970658325886208, "grad_norm": 0.3140379786491394, "learning_rate": 6.015324789590271e-06, "loss": 0.771, "num_input_tokens_seen": 30873760, "step": 53515 }, { "epoch": 7.971403038427167, "grad_norm": 0.21612133085727692, "learning_rate": 6.01109702912504e-06, "loss": 0.7971, "num_input_tokens_seen": 30876480, "step": 53520 }, { "epoch": 7.9721477509681264, "grad_norm": 0.25255343317985535, "learning_rate": 6.006870551852667e-06, "loss": 0.7861, "num_input_tokens_seen": 30879104, "step": 53525 }, { "epoch": 7.972892463509085, "grad_norm": 0.20286673307418823, "learning_rate": 6.00264535805877e-06, "loss": 0.8119, "num_input_tokens_seen": 30881984, "step": 53530 }, { "epoch": 7.973637176050045, "grad_norm": 0.1863037794828415, "learning_rate": 5.998421448028854e-06, "loss": 0.8137, "num_input_tokens_seen": 30885024, "step": 53535 }, { "epoch": 7.974381888591004, "grad_norm": 0.24951253831386566, "learning_rate": 5.994198822048361e-06, "loss": 0.7778, "num_input_tokens_seen": 30888096, "step": 53540 }, { "epoch": 7.975126601131963, "grad_norm": 0.21058841049671173, "learning_rate": 5.989977480402648e-06, "loss": 0.8142, "num_input_tokens_seen": 30890880, "step": 53545 }, { "epoch": 7.975871313672922, "grad_norm": 0.1847933679819107, "learning_rate": 5.985757423376962e-06, "loss": 0.7793, "num_input_tokens_seen": 30893568, "step": 53550 }, { "epoch": 7.976616026213882, "grad_norm": 0.24251946806907654, "learning_rate": 5.98153865125648e-06, "loss": 0.7791, "num_input_tokens_seen": 30896384, "step": 53555 }, { "epoch": 7.97736073875484, "grad_norm": 0.22917787730693817, "learning_rate": 5.977321164326294e-06, "loss": 0.8279, "num_input_tokens_seen": 30899328, "step": 53560 }, { "epoch": 7.9781054512958, "grad_norm": 0.236185684800148, "learning_rate": 5.973104962871403e-06, "loss": 0.7922, "num_input_tokens_seen": 30902176, "step": 53565 }, { "epoch": 7.978850163836759, "grad_norm": 0.1951328068971634, "learning_rate": 5.968890047176728e-06, "loss": 0.8114, "num_input_tokens_seen": 30904800, "step": 53570 }, { "epoch": 7.9795948763777185, "grad_norm": 0.1712806522846222, "learning_rate": 5.964676417527082e-06, "loss": 0.7922, "num_input_tokens_seen": 30907872, "step": 53575 }, { "epoch": 7.980339588918677, "grad_norm": 0.23420405387878418, "learning_rate": 5.960464074207217e-06, "loss": 0.8035, "num_input_tokens_seen": 30910464, "step": 53580 }, { "epoch": 7.981084301459637, "grad_norm": 0.2455923855304718, "learning_rate": 5.956253017501776e-06, "loss": 0.805, "num_input_tokens_seen": 30913312, "step": 53585 }, { "epoch": 7.981829014000596, "grad_norm": 0.2330782413482666, "learning_rate": 5.95204324769533e-06, "loss": 0.7979, "num_input_tokens_seen": 30916160, "step": 53590 }, { "epoch": 7.982573726541555, "grad_norm": 0.1936585158109665, "learning_rate": 5.947834765072355e-06, "loss": 0.7968, "num_input_tokens_seen": 30918688, "step": 53595 }, { "epoch": 7.983318439082514, "grad_norm": 0.2843545973300934, "learning_rate": 5.943627569917248e-06, "loss": 0.7753, "num_input_tokens_seen": 30921824, "step": 53600 }, { "epoch": 7.984063151623474, "grad_norm": 0.20769548416137695, "learning_rate": 5.939421662514314e-06, "loss": 0.8072, "num_input_tokens_seen": 30924704, "step": 53605 }, { "epoch": 7.9848078641644324, "grad_norm": 0.17614950239658356, "learning_rate": 5.9352170431477755e-06, "loss": 0.7977, "num_input_tokens_seen": 30927552, "step": 53610 }, { "epoch": 7.985552576705392, "grad_norm": 0.2639138698577881, "learning_rate": 5.931013712101754e-06, "loss": 0.79, "num_input_tokens_seen": 30930432, "step": 53615 }, { "epoch": 7.986297289246351, "grad_norm": 0.19095081090927124, "learning_rate": 5.926811669660296e-06, "loss": 0.7911, "num_input_tokens_seen": 30933376, "step": 53620 }, { "epoch": 7.9870420017873105, "grad_norm": 0.1337626427412033, "learning_rate": 5.922610916107355e-06, "loss": 0.7816, "num_input_tokens_seen": 30936096, "step": 53625 }, { "epoch": 7.987786714328269, "grad_norm": 0.24813176691532135, "learning_rate": 5.918411451726804e-06, "loss": 0.7879, "num_input_tokens_seen": 30938816, "step": 53630 }, { "epoch": 7.988531426869228, "grad_norm": 0.31973135471343994, "learning_rate": 5.91421327680243e-06, "loss": 0.8155, "num_input_tokens_seen": 30941600, "step": 53635 }, { "epoch": 7.989276139410188, "grad_norm": 0.19195911288261414, "learning_rate": 5.910016391617934e-06, "loss": 0.7853, "num_input_tokens_seen": 30944608, "step": 53640 }, { "epoch": 7.990020851951147, "grad_norm": 0.18549837172031403, "learning_rate": 5.905820796456906e-06, "loss": 0.8166, "num_input_tokens_seen": 30947424, "step": 53645 }, { "epoch": 7.990765564492106, "grad_norm": 0.22332626581192017, "learning_rate": 5.901626491602885e-06, "loss": 0.8104, "num_input_tokens_seen": 30950176, "step": 53650 }, { "epoch": 7.991510277033065, "grad_norm": 0.21158085763454437, "learning_rate": 5.8974334773392924e-06, "loss": 0.8131, "num_input_tokens_seen": 30953216, "step": 53655 }, { "epoch": 7.9922549895740245, "grad_norm": 0.1832137554883957, "learning_rate": 5.893241753949477e-06, "loss": 0.8024, "num_input_tokens_seen": 30956000, "step": 53660 }, { "epoch": 7.992999702114983, "grad_norm": 0.21045182645320892, "learning_rate": 5.889051321716702e-06, "loss": 0.7928, "num_input_tokens_seen": 30958944, "step": 53665 }, { "epoch": 7.993744414655943, "grad_norm": 0.3165088891983032, "learning_rate": 5.8848621809241415e-06, "loss": 0.8348, "num_input_tokens_seen": 30961984, "step": 53670 }, { "epoch": 7.994489127196902, "grad_norm": 0.33137840032577515, "learning_rate": 5.880674331854882e-06, "loss": 0.79, "num_input_tokens_seen": 30964928, "step": 53675 }, { "epoch": 7.995233839737861, "grad_norm": 0.27334532141685486, "learning_rate": 5.876487774791914e-06, "loss": 0.837, "num_input_tokens_seen": 30967552, "step": 53680 }, { "epoch": 7.99597855227882, "grad_norm": 0.1959219127893448, "learning_rate": 5.872302510018149e-06, "loss": 0.8005, "num_input_tokens_seen": 30970528, "step": 53685 }, { "epoch": 7.99672326481978, "grad_norm": 0.19195272028446198, "learning_rate": 5.8681185378164224e-06, "loss": 0.8184, "num_input_tokens_seen": 30973600, "step": 53690 }, { "epoch": 7.9974679773607384, "grad_norm": 0.22676025331020355, "learning_rate": 5.863935858469452e-06, "loss": 0.7854, "num_input_tokens_seen": 30976320, "step": 53695 }, { "epoch": 7.998212689901698, "grad_norm": 0.22067704796791077, "learning_rate": 5.859754472259893e-06, "loss": 0.814, "num_input_tokens_seen": 30978880, "step": 53700 }, { "epoch": 7.998957402442657, "grad_norm": 0.2445594221353531, "learning_rate": 5.855574379470311e-06, "loss": 0.8196, "num_input_tokens_seen": 30981632, "step": 53705 }, { "epoch": 7.9997021149836165, "grad_norm": 0.2806633710861206, "learning_rate": 5.851395580383182e-06, "loss": 0.7946, "num_input_tokens_seen": 30984608, "step": 53710 }, { "epoch": 8.0, "eval_loss": 0.8029014468193054, "eval_runtime": 45.2789, "eval_samples_per_second": 65.903, "eval_steps_per_second": 16.476, "num_input_tokens_seen": 30985288, "step": 53712 }, { "epoch": 8.000446827524575, "grad_norm": 0.22584378719329834, "learning_rate": 5.8472180752808805e-06, "loss": 0.8, "num_input_tokens_seen": 30986824, "step": 53715 }, { "epoch": 8.001191540065534, "grad_norm": 0.20365740358829498, "learning_rate": 5.843041864445714e-06, "loss": 0.7786, "num_input_tokens_seen": 30989800, "step": 53720 }, { "epoch": 8.001936252606495, "grad_norm": 0.20301225781440735, "learning_rate": 5.838866948159888e-06, "loss": 0.7826, "num_input_tokens_seen": 30992520, "step": 53725 }, { "epoch": 8.002680965147453, "grad_norm": 0.20712688565254211, "learning_rate": 5.8346933267055394e-06, "loss": 0.7811, "num_input_tokens_seen": 30995336, "step": 53730 }, { "epoch": 8.003425677688412, "grad_norm": 0.27106431126594543, "learning_rate": 5.830521000364689e-06, "loss": 0.7645, "num_input_tokens_seen": 30998312, "step": 53735 }, { "epoch": 8.00417039022937, "grad_norm": 0.3692930340766907, "learning_rate": 5.826349969419292e-06, "loss": 0.8055, "num_input_tokens_seen": 31001480, "step": 53740 }, { "epoch": 8.004915102770331, "grad_norm": 0.19910690188407898, "learning_rate": 5.822180234151214e-06, "loss": 0.7989, "num_input_tokens_seen": 31004232, "step": 53745 }, { "epoch": 8.00565981531129, "grad_norm": 0.21593724191188812, "learning_rate": 5.818011794842221e-06, "loss": 0.8172, "num_input_tokens_seen": 31007176, "step": 53750 }, { "epoch": 8.006404527852249, "grad_norm": 0.3613351583480835, "learning_rate": 5.8138446517740005e-06, "loss": 0.8187, "num_input_tokens_seen": 31010184, "step": 53755 }, { "epoch": 8.007149240393208, "grad_norm": 0.1981191784143448, "learning_rate": 5.809678805228152e-06, "loss": 0.7882, "num_input_tokens_seen": 31013192, "step": 53760 }, { "epoch": 8.007893952934168, "grad_norm": 0.2811751365661621, "learning_rate": 5.805514255486191e-06, "loss": 0.8225, "num_input_tokens_seen": 31016072, "step": 53765 }, { "epoch": 8.008638665475127, "grad_norm": 0.2150801122188568, "learning_rate": 5.801351002829542e-06, "loss": 0.8085, "num_input_tokens_seen": 31018952, "step": 53770 }, { "epoch": 8.009383378016086, "grad_norm": 0.2039414346218109, "learning_rate": 5.797189047539531e-06, "loss": 0.8069, "num_input_tokens_seen": 31021864, "step": 53775 }, { "epoch": 8.010128090557044, "grad_norm": 0.22618559002876282, "learning_rate": 5.793028389897418e-06, "loss": 0.7741, "num_input_tokens_seen": 31024648, "step": 53780 }, { "epoch": 8.010872803098005, "grad_norm": 0.23334753513336182, "learning_rate": 5.788869030184346e-06, "loss": 0.7933, "num_input_tokens_seen": 31027528, "step": 53785 }, { "epoch": 8.011617515638964, "grad_norm": 0.23235948383808136, "learning_rate": 5.784710968681403e-06, "loss": 0.8266, "num_input_tokens_seen": 31030248, "step": 53790 }, { "epoch": 8.012362228179922, "grad_norm": 0.19149340689182281, "learning_rate": 5.780554205669567e-06, "loss": 0.813, "num_input_tokens_seen": 31033000, "step": 53795 }, { "epoch": 8.013106940720881, "grad_norm": 0.2119636982679367, "learning_rate": 5.776398741429737e-06, "loss": 0.7934, "num_input_tokens_seen": 31035848, "step": 53800 }, { "epoch": 8.013851653261842, "grad_norm": 0.21375980973243713, "learning_rate": 5.77224457624273e-06, "loss": 0.7733, "num_input_tokens_seen": 31038952, "step": 53805 }, { "epoch": 8.0145963658028, "grad_norm": 0.17255304753780365, "learning_rate": 5.768091710389254e-06, "loss": 0.7814, "num_input_tokens_seen": 31042088, "step": 53810 }, { "epoch": 8.01534107834376, "grad_norm": 0.1938437521457672, "learning_rate": 5.763940144149954e-06, "loss": 0.7858, "num_input_tokens_seen": 31044840, "step": 53815 }, { "epoch": 8.016085790884718, "grad_norm": 0.14915506541728973, "learning_rate": 5.759789877805363e-06, "loss": 0.8083, "num_input_tokens_seen": 31047752, "step": 53820 }, { "epoch": 8.016830503425677, "grad_norm": 0.18031716346740723, "learning_rate": 5.755640911635951e-06, "loss": 0.79, "num_input_tokens_seen": 31050824, "step": 53825 }, { "epoch": 8.017575215966637, "grad_norm": 0.2533290982246399, "learning_rate": 5.7514932459220825e-06, "loss": 0.7897, "num_input_tokens_seen": 31053672, "step": 53830 }, { "epoch": 8.018319928507596, "grad_norm": 0.2969483733177185, "learning_rate": 5.747346880944041e-06, "loss": 0.7816, "num_input_tokens_seen": 31056232, "step": 53835 }, { "epoch": 8.019064641048555, "grad_norm": 0.24505110085010529, "learning_rate": 5.743201816982027e-06, "loss": 0.7815, "num_input_tokens_seen": 31059304, "step": 53840 }, { "epoch": 8.019809353589514, "grad_norm": 0.20649418234825134, "learning_rate": 5.739058054316138e-06, "loss": 0.8101, "num_input_tokens_seen": 31062376, "step": 53845 }, { "epoch": 8.020554066130474, "grad_norm": 0.3138434886932373, "learning_rate": 5.7349155932264035e-06, "loss": 0.7644, "num_input_tokens_seen": 31065352, "step": 53850 }, { "epoch": 8.021298778671433, "grad_norm": 0.23018741607666016, "learning_rate": 5.730774433992739e-06, "loss": 0.8242, "num_input_tokens_seen": 31068168, "step": 53855 }, { "epoch": 8.022043491212392, "grad_norm": 0.23564453423023224, "learning_rate": 5.726634576894993e-06, "loss": 0.7737, "num_input_tokens_seen": 31071208, "step": 53860 }, { "epoch": 8.02278820375335, "grad_norm": 0.21239638328552246, "learning_rate": 5.7224960222129255e-06, "loss": 0.7881, "num_input_tokens_seen": 31074184, "step": 53865 }, { "epoch": 8.023532916294311, "grad_norm": 0.1659722626209259, "learning_rate": 5.718358770226201e-06, "loss": 0.7827, "num_input_tokens_seen": 31077032, "step": 53870 }, { "epoch": 8.02427762883527, "grad_norm": 0.24884022772312164, "learning_rate": 5.714222821214402e-06, "loss": 0.8111, "num_input_tokens_seen": 31079528, "step": 53875 }, { "epoch": 8.025022341376228, "grad_norm": 0.3976815938949585, "learning_rate": 5.710088175457007e-06, "loss": 0.8133, "num_input_tokens_seen": 31082408, "step": 53880 }, { "epoch": 8.025767053917187, "grad_norm": 0.19999130070209503, "learning_rate": 5.705954833233429e-06, "loss": 0.8461, "num_input_tokens_seen": 31085096, "step": 53885 }, { "epoch": 8.026511766458148, "grad_norm": 0.3012391924858093, "learning_rate": 5.701822794822978e-06, "loss": 0.8064, "num_input_tokens_seen": 31088296, "step": 53890 }, { "epoch": 8.027256478999107, "grad_norm": 0.30502480268478394, "learning_rate": 5.6976920605048885e-06, "loss": 0.8279, "num_input_tokens_seen": 31091432, "step": 53895 }, { "epoch": 8.028001191540065, "grad_norm": 0.22543083131313324, "learning_rate": 5.6935626305582855e-06, "loss": 0.7874, "num_input_tokens_seen": 31094536, "step": 53900 }, { "epoch": 8.028745904081024, "grad_norm": 0.24012118577957153, "learning_rate": 5.6894345052622255e-06, "loss": 0.8044, "num_input_tokens_seen": 31097512, "step": 53905 }, { "epoch": 8.029490616621985, "grad_norm": 0.4024931490421295, "learning_rate": 5.685307684895677e-06, "loss": 0.8005, "num_input_tokens_seen": 31100392, "step": 53910 }, { "epoch": 8.030235329162943, "grad_norm": 0.2364150434732437, "learning_rate": 5.6811821697375005e-06, "loss": 0.8331, "num_input_tokens_seen": 31103080, "step": 53915 }, { "epoch": 8.030980041703902, "grad_norm": 0.2236400544643402, "learning_rate": 5.67705796006649e-06, "loss": 0.7909, "num_input_tokens_seen": 31106344, "step": 53920 }, { "epoch": 8.03172475424486, "grad_norm": 0.31565892696380615, "learning_rate": 5.672935056161338e-06, "loss": 0.8031, "num_input_tokens_seen": 31109320, "step": 53925 }, { "epoch": 8.032469466785821, "grad_norm": 0.19659560918807983, "learning_rate": 5.668813458300659e-06, "loss": 0.8122, "num_input_tokens_seen": 31112072, "step": 53930 }, { "epoch": 8.03321417932678, "grad_norm": 0.1995856761932373, "learning_rate": 5.664693166762977e-06, "loss": 0.7896, "num_input_tokens_seen": 31114824, "step": 53935 }, { "epoch": 8.033958891867739, "grad_norm": 0.24863722920417786, "learning_rate": 5.660574181826714e-06, "loss": 0.7968, "num_input_tokens_seen": 31117640, "step": 53940 }, { "epoch": 8.034703604408698, "grad_norm": 0.19644379615783691, "learning_rate": 5.656456503770227e-06, "loss": 0.7937, "num_input_tokens_seen": 31120840, "step": 53945 }, { "epoch": 8.035448316949658, "grad_norm": 0.2592855989933014, "learning_rate": 5.652340132871756e-06, "loss": 0.8027, "num_input_tokens_seen": 31124104, "step": 53950 }, { "epoch": 8.036193029490617, "grad_norm": 0.23609212040901184, "learning_rate": 5.648225069409477e-06, "loss": 0.8347, "num_input_tokens_seen": 31126728, "step": 53955 }, { "epoch": 8.036937742031576, "grad_norm": 0.263101190328598, "learning_rate": 5.644111313661471e-06, "loss": 0.8003, "num_input_tokens_seen": 31129480, "step": 53960 }, { "epoch": 8.037682454572534, "grad_norm": 0.19502125680446625, "learning_rate": 5.639998865905724e-06, "loss": 0.784, "num_input_tokens_seen": 31132584, "step": 53965 }, { "epoch": 8.038427167113495, "grad_norm": 0.23945505917072296, "learning_rate": 5.63588772642015e-06, "loss": 0.7918, "num_input_tokens_seen": 31135848, "step": 53970 }, { "epoch": 8.039171879654454, "grad_norm": 0.22917184233665466, "learning_rate": 5.631777895482549e-06, "loss": 0.8069, "num_input_tokens_seen": 31138792, "step": 53975 }, { "epoch": 8.039916592195413, "grad_norm": 0.20426444709300995, "learning_rate": 5.627669373370658e-06, "loss": 0.8202, "num_input_tokens_seen": 31141704, "step": 53980 }, { "epoch": 8.040661304736371, "grad_norm": 0.24552114307880402, "learning_rate": 5.6235621603621004e-06, "loss": 0.7997, "num_input_tokens_seen": 31144552, "step": 53985 }, { "epoch": 8.041406017277332, "grad_norm": 0.221135213971138, "learning_rate": 5.619456256734434e-06, "loss": 0.8194, "num_input_tokens_seen": 31147560, "step": 53990 }, { "epoch": 8.04215072981829, "grad_norm": 0.3066270649433136, "learning_rate": 5.61535166276512e-06, "loss": 0.7776, "num_input_tokens_seen": 31150568, "step": 53995 }, { "epoch": 8.04289544235925, "grad_norm": 0.3190319538116455, "learning_rate": 5.611248378731526e-06, "loss": 0.8069, "num_input_tokens_seen": 31153512, "step": 54000 }, { "epoch": 8.043640154900208, "grad_norm": 0.19437634944915771, "learning_rate": 5.607146404910949e-06, "loss": 0.7697, "num_input_tokens_seen": 31156232, "step": 54005 }, { "epoch": 8.044384867441167, "grad_norm": 0.1948338896036148, "learning_rate": 5.603045741580559e-06, "loss": 0.7969, "num_input_tokens_seen": 31159048, "step": 54010 }, { "epoch": 8.045129579982127, "grad_norm": 0.24244146049022675, "learning_rate": 5.598946389017487e-06, "loss": 0.8274, "num_input_tokens_seen": 31161928, "step": 54015 }, { "epoch": 8.045874292523086, "grad_norm": 0.23529335856437683, "learning_rate": 5.59484834749873e-06, "loss": 0.7917, "num_input_tokens_seen": 31164776, "step": 54020 }, { "epoch": 8.046619005064045, "grad_norm": 0.24757114052772522, "learning_rate": 5.590751617301226e-06, "loss": 0.8093, "num_input_tokens_seen": 31167944, "step": 54025 }, { "epoch": 8.047363717605004, "grad_norm": 0.20852024853229523, "learning_rate": 5.5866561987018176e-06, "loss": 0.8118, "num_input_tokens_seen": 31170536, "step": 54030 }, { "epoch": 8.048108430145964, "grad_norm": 0.2539699375629425, "learning_rate": 5.582562091977253e-06, "loss": 0.7547, "num_input_tokens_seen": 31173736, "step": 54035 }, { "epoch": 8.048853142686923, "grad_norm": 0.21360832452774048, "learning_rate": 5.578469297404204e-06, "loss": 0.7942, "num_input_tokens_seen": 31176872, "step": 54040 }, { "epoch": 8.049597855227882, "grad_norm": 0.2778129279613495, "learning_rate": 5.574377815259229e-06, "loss": 0.7828, "num_input_tokens_seen": 31179784, "step": 54045 }, { "epoch": 8.05034256776884, "grad_norm": 0.18042698502540588, "learning_rate": 5.570287645818825e-06, "loss": 0.7868, "num_input_tokens_seen": 31182536, "step": 54050 }, { "epoch": 8.051087280309801, "grad_norm": 0.21708904206752777, "learning_rate": 5.566198789359392e-06, "loss": 0.7888, "num_input_tokens_seen": 31185352, "step": 54055 }, { "epoch": 8.05183199285076, "grad_norm": 0.2101367712020874, "learning_rate": 5.562111246157228e-06, "loss": 0.789, "num_input_tokens_seen": 31188360, "step": 54060 }, { "epoch": 8.052576705391719, "grad_norm": 0.2240515500307083, "learning_rate": 5.558025016488555e-06, "loss": 0.8097, "num_input_tokens_seen": 31191272, "step": 54065 }, { "epoch": 8.053321417932677, "grad_norm": 0.2863628566265106, "learning_rate": 5.553940100629507e-06, "loss": 0.7861, "num_input_tokens_seen": 31193992, "step": 54070 }, { "epoch": 8.054066130473638, "grad_norm": 0.18810856342315674, "learning_rate": 5.549856498856129e-06, "loss": 0.7928, "num_input_tokens_seen": 31196712, "step": 54075 }, { "epoch": 8.054810843014597, "grad_norm": 0.2839813828468323, "learning_rate": 5.545774211444369e-06, "loss": 0.7897, "num_input_tokens_seen": 31199848, "step": 54080 }, { "epoch": 8.055555555555555, "grad_norm": 0.23979437351226807, "learning_rate": 5.541693238670087e-06, "loss": 0.7997, "num_input_tokens_seen": 31202696, "step": 54085 }, { "epoch": 8.056300268096514, "grad_norm": 0.26838478446006775, "learning_rate": 5.537613580809067e-06, "loss": 0.8176, "num_input_tokens_seen": 31205320, "step": 54090 }, { "epoch": 8.057044980637475, "grad_norm": 0.24641500413417816, "learning_rate": 5.533535238137e-06, "loss": 0.8027, "num_input_tokens_seen": 31208584, "step": 54095 }, { "epoch": 8.057789693178433, "grad_norm": 0.18622560799121857, "learning_rate": 5.5294582109294696e-06, "loss": 0.7991, "num_input_tokens_seen": 31211624, "step": 54100 }, { "epoch": 8.058534405719392, "grad_norm": 0.14533410966396332, "learning_rate": 5.525382499461993e-06, "loss": 0.798, "num_input_tokens_seen": 31214376, "step": 54105 }, { "epoch": 8.059279118260351, "grad_norm": 0.25851210951805115, "learning_rate": 5.521308104009992e-06, "loss": 0.7906, "num_input_tokens_seen": 31217288, "step": 54110 }, { "epoch": 8.060023830801311, "grad_norm": 0.1867808848619461, "learning_rate": 5.517235024848791e-06, "loss": 0.7963, "num_input_tokens_seen": 31220040, "step": 54115 }, { "epoch": 8.06076854334227, "grad_norm": 0.28369835019111633, "learning_rate": 5.513163262253635e-06, "loss": 0.8214, "num_input_tokens_seen": 31222984, "step": 54120 }, { "epoch": 8.061513255883229, "grad_norm": 0.14720718562602997, "learning_rate": 5.509092816499678e-06, "loss": 0.7909, "num_input_tokens_seen": 31225928, "step": 54125 }, { "epoch": 8.062257968424188, "grad_norm": 0.3101530373096466, "learning_rate": 5.505023687861985e-06, "loss": 0.844, "num_input_tokens_seen": 31229192, "step": 54130 }, { "epoch": 8.063002680965148, "grad_norm": 0.205892875790596, "learning_rate": 5.500955876615538e-06, "loss": 0.8173, "num_input_tokens_seen": 31232136, "step": 54135 }, { "epoch": 8.063747393506107, "grad_norm": 0.3131982982158661, "learning_rate": 5.496889383035206e-06, "loss": 0.7768, "num_input_tokens_seen": 31235080, "step": 54140 }, { "epoch": 8.064492106047066, "grad_norm": 0.24896658957004547, "learning_rate": 5.492824207395805e-06, "loss": 0.831, "num_input_tokens_seen": 31238152, "step": 54145 }, { "epoch": 8.065236818588025, "grad_norm": 0.22438524663448334, "learning_rate": 5.4887603499720244e-06, "loss": 0.8005, "num_input_tokens_seen": 31240968, "step": 54150 }, { "epoch": 8.065981531128985, "grad_norm": 0.1994887888431549, "learning_rate": 5.484697811038494e-06, "loss": 0.7905, "num_input_tokens_seen": 31243880, "step": 54155 }, { "epoch": 8.066726243669944, "grad_norm": 0.2369561344385147, "learning_rate": 5.480636590869742e-06, "loss": 0.8104, "num_input_tokens_seen": 31246888, "step": 54160 }, { "epoch": 8.067470956210903, "grad_norm": 0.2734241187572479, "learning_rate": 5.476576689740209e-06, "loss": 0.8282, "num_input_tokens_seen": 31250120, "step": 54165 }, { "epoch": 8.068215668751861, "grad_norm": 0.18451358377933502, "learning_rate": 5.472518107924255e-06, "loss": 0.7542, "num_input_tokens_seen": 31253128, "step": 54170 }, { "epoch": 8.06896038129282, "grad_norm": 0.193407341837883, "learning_rate": 5.468460845696133e-06, "loss": 0.793, "num_input_tokens_seen": 31255944, "step": 54175 }, { "epoch": 8.06970509383378, "grad_norm": 0.21561786532402039, "learning_rate": 5.4644049033300085e-06, "loss": 0.8024, "num_input_tokens_seen": 31258760, "step": 54180 }, { "epoch": 8.07044980637474, "grad_norm": 0.15564896166324615, "learning_rate": 5.460350281099977e-06, "loss": 0.8164, "num_input_tokens_seen": 31261672, "step": 54185 }, { "epoch": 8.071194518915698, "grad_norm": 0.27233555912971497, "learning_rate": 5.4562969792800315e-06, "loss": 0.8144, "num_input_tokens_seen": 31264552, "step": 54190 }, { "epoch": 8.071939231456657, "grad_norm": 0.20602166652679443, "learning_rate": 5.452244998144076e-06, "loss": 0.809, "num_input_tokens_seen": 31267464, "step": 54195 }, { "epoch": 8.072683943997617, "grad_norm": 0.21189023554325104, "learning_rate": 5.448194337965931e-06, "loss": 0.774, "num_input_tokens_seen": 31270024, "step": 54200 }, { "epoch": 8.073428656538576, "grad_norm": 0.17541134357452393, "learning_rate": 5.444144999019324e-06, "loss": 0.778, "num_input_tokens_seen": 31272872, "step": 54205 }, { "epoch": 8.074173369079535, "grad_norm": 0.19057433307170868, "learning_rate": 5.4400969815778854e-06, "loss": 0.7921, "num_input_tokens_seen": 31275848, "step": 54210 }, { "epoch": 8.074918081620494, "grad_norm": 0.20870453119277954, "learning_rate": 5.436050285915173e-06, "loss": 0.8097, "num_input_tokens_seen": 31278824, "step": 54215 }, { "epoch": 8.075662794161454, "grad_norm": 0.3813979923725128, "learning_rate": 5.432004912304636e-06, "loss": 0.788, "num_input_tokens_seen": 31282152, "step": 54220 }, { "epoch": 8.076407506702413, "grad_norm": 0.18315577507019043, "learning_rate": 5.427960861019648e-06, "loss": 0.8119, "num_input_tokens_seen": 31285032, "step": 54225 }, { "epoch": 8.077152219243372, "grad_norm": 0.2719988524913788, "learning_rate": 5.423918132333491e-06, "loss": 0.7966, "num_input_tokens_seen": 31287944, "step": 54230 }, { "epoch": 8.07789693178433, "grad_norm": 0.21385706961154938, "learning_rate": 5.4198767265193574e-06, "loss": 0.808, "num_input_tokens_seen": 31290856, "step": 54235 }, { "epoch": 8.078641644325291, "grad_norm": 0.25474053621292114, "learning_rate": 5.415836643850352e-06, "loss": 0.8075, "num_input_tokens_seen": 31293704, "step": 54240 }, { "epoch": 8.07938635686625, "grad_norm": 0.18147000670433044, "learning_rate": 5.411797884599479e-06, "loss": 0.7875, "num_input_tokens_seen": 31296680, "step": 54245 }, { "epoch": 8.080131069407209, "grad_norm": 0.19465023279190063, "learning_rate": 5.407760449039662e-06, "loss": 0.7801, "num_input_tokens_seen": 31299528, "step": 54250 }, { "epoch": 8.080875781948167, "grad_norm": 0.2212512344121933, "learning_rate": 5.403724337443747e-06, "loss": 0.7845, "num_input_tokens_seen": 31302216, "step": 54255 }, { "epoch": 8.081620494489128, "grad_norm": 0.22020967304706573, "learning_rate": 5.399689550084461e-06, "loss": 0.8441, "num_input_tokens_seen": 31305128, "step": 54260 }, { "epoch": 8.082365207030087, "grad_norm": 0.19838008284568787, "learning_rate": 5.395656087234466e-06, "loss": 0.7905, "num_input_tokens_seen": 31307784, "step": 54265 }, { "epoch": 8.083109919571045, "grad_norm": 0.22550837695598602, "learning_rate": 5.391623949166327e-06, "loss": 0.8138, "num_input_tokens_seen": 31310408, "step": 54270 }, { "epoch": 8.083854632112004, "grad_norm": 0.2539621889591217, "learning_rate": 5.387593136152527e-06, "loss": 0.7974, "num_input_tokens_seen": 31313576, "step": 54275 }, { "epoch": 8.084599344652965, "grad_norm": 0.19815313816070557, "learning_rate": 5.383563648465437e-06, "loss": 0.8058, "num_input_tokens_seen": 31316488, "step": 54280 }, { "epoch": 8.085344057193923, "grad_norm": 0.13994771242141724, "learning_rate": 5.37953548637736e-06, "loss": 0.7963, "num_input_tokens_seen": 31319336, "step": 54285 }, { "epoch": 8.086088769734882, "grad_norm": 0.2787477374076843, "learning_rate": 5.375508650160507e-06, "loss": 0.7838, "num_input_tokens_seen": 31322152, "step": 54290 }, { "epoch": 8.086833482275841, "grad_norm": 0.2329527884721756, "learning_rate": 5.371483140086997e-06, "loss": 0.8109, "num_input_tokens_seen": 31325064, "step": 54295 }, { "epoch": 8.087578194816802, "grad_norm": 0.27051442861557007, "learning_rate": 5.367458956428845e-06, "loss": 0.8127, "num_input_tokens_seen": 31328200, "step": 54300 }, { "epoch": 8.08832290735776, "grad_norm": 0.20791354775428772, "learning_rate": 5.363436099457997e-06, "loss": 0.7838, "num_input_tokens_seen": 31331176, "step": 54305 }, { "epoch": 8.089067619898719, "grad_norm": 0.22162359952926636, "learning_rate": 5.359414569446308e-06, "loss": 0.8174, "num_input_tokens_seen": 31333864, "step": 54310 }, { "epoch": 8.089812332439678, "grad_norm": 0.18583418428897858, "learning_rate": 5.355394366665525e-06, "loss": 0.7841, "num_input_tokens_seen": 31336648, "step": 54315 }, { "epoch": 8.090557044980638, "grad_norm": 0.1959998905658722, "learning_rate": 5.35137549138732e-06, "loss": 0.7979, "num_input_tokens_seen": 31339304, "step": 54320 }, { "epoch": 8.091301757521597, "grad_norm": 0.2451862096786499, "learning_rate": 5.347357943883272e-06, "loss": 0.7875, "num_input_tokens_seen": 31342024, "step": 54325 }, { "epoch": 8.092046470062556, "grad_norm": 0.20750722289085388, "learning_rate": 5.343341724424875e-06, "loss": 0.7931, "num_input_tokens_seen": 31344968, "step": 54330 }, { "epoch": 8.092791182603515, "grad_norm": 0.21527676284313202, "learning_rate": 5.339326833283531e-06, "loss": 0.8, "num_input_tokens_seen": 31347816, "step": 54335 }, { "epoch": 8.093535895144473, "grad_norm": 0.20551615953445435, "learning_rate": 5.335313270730546e-06, "loss": 0.7865, "num_input_tokens_seen": 31350536, "step": 54340 }, { "epoch": 8.094280607685434, "grad_norm": 0.19457773864269257, "learning_rate": 5.331301037037132e-06, "loss": 0.7993, "num_input_tokens_seen": 31353384, "step": 54345 }, { "epoch": 8.095025320226393, "grad_norm": 0.19261988997459412, "learning_rate": 5.327290132474427e-06, "loss": 0.787, "num_input_tokens_seen": 31356584, "step": 54350 }, { "epoch": 8.095770032767351, "grad_norm": 0.21712914109230042, "learning_rate": 5.323280557313473e-06, "loss": 0.7922, "num_input_tokens_seen": 31359304, "step": 54355 }, { "epoch": 8.09651474530831, "grad_norm": 0.20500625669956207, "learning_rate": 5.319272311825216e-06, "loss": 0.8174, "num_input_tokens_seen": 31362216, "step": 54360 }, { "epoch": 8.09725945784927, "grad_norm": 0.2140784114599228, "learning_rate": 5.315265396280522e-06, "loss": 0.7853, "num_input_tokens_seen": 31364968, "step": 54365 }, { "epoch": 8.09800417039023, "grad_norm": 0.3817576766014099, "learning_rate": 5.311259810950167e-06, "loss": 0.7893, "num_input_tokens_seen": 31368072, "step": 54370 }, { "epoch": 8.098748882931188, "grad_norm": 0.22645175457000732, "learning_rate": 5.3072555561048255e-06, "loss": 0.7837, "num_input_tokens_seen": 31370728, "step": 54375 }, { "epoch": 8.099493595472147, "grad_norm": 0.21121366322040558, "learning_rate": 5.303252632015082e-06, "loss": 0.7936, "num_input_tokens_seen": 31373672, "step": 54380 }, { "epoch": 8.100238308013108, "grad_norm": 0.21064402163028717, "learning_rate": 5.299251038951444e-06, "loss": 0.8208, "num_input_tokens_seen": 31376616, "step": 54385 }, { "epoch": 8.100983020554066, "grad_norm": 0.32283493876457214, "learning_rate": 5.295250777184324e-06, "loss": 0.7934, "num_input_tokens_seen": 31379656, "step": 54390 }, { "epoch": 8.101727733095025, "grad_norm": 0.2572127878665924, "learning_rate": 5.2912518469840436e-06, "loss": 0.7899, "num_input_tokens_seen": 31382600, "step": 54395 }, { "epoch": 8.102472445635984, "grad_norm": 0.2708246111869812, "learning_rate": 5.287254248620832e-06, "loss": 0.8065, "num_input_tokens_seen": 31385352, "step": 54400 }, { "epoch": 8.103217158176944, "grad_norm": 0.312072217464447, "learning_rate": 5.283257982364839e-06, "loss": 0.7861, "num_input_tokens_seen": 31388232, "step": 54405 }, { "epoch": 8.103961870717903, "grad_norm": 0.2564032971858978, "learning_rate": 5.279263048486102e-06, "loss": 0.786, "num_input_tokens_seen": 31390952, "step": 54410 }, { "epoch": 8.104706583258862, "grad_norm": 0.20864714682102203, "learning_rate": 5.275269447254597e-06, "loss": 0.7969, "num_input_tokens_seen": 31393736, "step": 54415 }, { "epoch": 8.10545129579982, "grad_norm": 0.22805187106132507, "learning_rate": 5.271277178940182e-06, "loss": 0.7794, "num_input_tokens_seen": 31396776, "step": 54420 }, { "epoch": 8.106196008340781, "grad_norm": 0.21277296543121338, "learning_rate": 5.267286243812641e-06, "loss": 0.7893, "num_input_tokens_seen": 31400104, "step": 54425 }, { "epoch": 8.10694072088174, "grad_norm": 0.26090243458747864, "learning_rate": 5.263296642141671e-06, "loss": 0.8096, "num_input_tokens_seen": 31403016, "step": 54430 }, { "epoch": 8.107685433422699, "grad_norm": 0.2090785801410675, "learning_rate": 5.25930837419687e-06, "loss": 0.797, "num_input_tokens_seen": 31405800, "step": 54435 }, { "epoch": 8.108430145963657, "grad_norm": 0.28327295184135437, "learning_rate": 5.2553214402477565e-06, "loss": 0.7855, "num_input_tokens_seen": 31408744, "step": 54440 }, { "epoch": 8.109174858504618, "grad_norm": 0.20735076069831848, "learning_rate": 5.251335840563737e-06, "loss": 0.7799, "num_input_tokens_seen": 31411432, "step": 54445 }, { "epoch": 8.109919571045577, "grad_norm": 0.1991066336631775, "learning_rate": 5.247351575414148e-06, "loss": 0.7905, "num_input_tokens_seen": 31414408, "step": 54450 }, { "epoch": 8.110664283586535, "grad_norm": 0.29319852590560913, "learning_rate": 5.243368645068239e-06, "loss": 0.7988, "num_input_tokens_seen": 31417256, "step": 54455 }, { "epoch": 8.111408996127494, "grad_norm": 0.25998854637145996, "learning_rate": 5.239387049795144e-06, "loss": 0.8046, "num_input_tokens_seen": 31420040, "step": 54460 }, { "epoch": 8.112153708668455, "grad_norm": 0.21228976547718048, "learning_rate": 5.235406789863934e-06, "loss": 0.8044, "num_input_tokens_seen": 31422824, "step": 54465 }, { "epoch": 8.112898421209414, "grad_norm": 0.23522883653640747, "learning_rate": 5.2314278655435726e-06, "loss": 0.8026, "num_input_tokens_seen": 31425800, "step": 54470 }, { "epoch": 8.113643133750372, "grad_norm": 0.22705809772014618, "learning_rate": 5.227450277102952e-06, "loss": 0.7734, "num_input_tokens_seen": 31428744, "step": 54475 }, { "epoch": 8.114387846291331, "grad_norm": 0.19204623997211456, "learning_rate": 5.223474024810846e-06, "loss": 0.817, "num_input_tokens_seen": 31431624, "step": 54480 }, { "epoch": 8.115132558832292, "grad_norm": 0.2617115080356598, "learning_rate": 5.219499108935957e-06, "loss": 0.7881, "num_input_tokens_seen": 31434536, "step": 54485 }, { "epoch": 8.11587727137325, "grad_norm": 0.16449496150016785, "learning_rate": 5.215525529746901e-06, "loss": 0.792, "num_input_tokens_seen": 31437416, "step": 54490 }, { "epoch": 8.116621983914209, "grad_norm": 0.23811495304107666, "learning_rate": 5.211553287512189e-06, "loss": 0.8029, "num_input_tokens_seen": 31440200, "step": 54495 }, { "epoch": 8.117366696455168, "grad_norm": 0.31587114930152893, "learning_rate": 5.207582382500259e-06, "loss": 0.7789, "num_input_tokens_seen": 31443240, "step": 54500 }, { "epoch": 8.118111408996128, "grad_norm": 0.2566671073436737, "learning_rate": 5.203612814979442e-06, "loss": 0.8002, "num_input_tokens_seen": 31446056, "step": 54505 }, { "epoch": 8.118856121537087, "grad_norm": 0.19894130527973175, "learning_rate": 5.199644585217978e-06, "loss": 0.7768, "num_input_tokens_seen": 31448968, "step": 54510 }, { "epoch": 8.119600834078046, "grad_norm": 0.3024330735206604, "learning_rate": 5.19567769348403e-06, "loss": 0.802, "num_input_tokens_seen": 31452072, "step": 54515 }, { "epoch": 8.120345546619005, "grad_norm": 0.29375794529914856, "learning_rate": 5.1917121400456654e-06, "loss": 0.7793, "num_input_tokens_seen": 31455208, "step": 54520 }, { "epoch": 8.121090259159963, "grad_norm": 0.18974347412586212, "learning_rate": 5.187747925170858e-06, "loss": 0.8132, "num_input_tokens_seen": 31458120, "step": 54525 }, { "epoch": 8.121834971700924, "grad_norm": 0.2747560143470764, "learning_rate": 5.1837850491274985e-06, "loss": 0.7952, "num_input_tokens_seen": 31460712, "step": 54530 }, { "epoch": 8.122579684241883, "grad_norm": 0.2057218700647354, "learning_rate": 5.179823512183382e-06, "loss": 0.8031, "num_input_tokens_seen": 31463656, "step": 54535 }, { "epoch": 8.123324396782841, "grad_norm": 0.18749962747097015, "learning_rate": 5.175863314606211e-06, "loss": 0.809, "num_input_tokens_seen": 31466504, "step": 54540 }, { "epoch": 8.1240691093238, "grad_norm": 0.16967260837554932, "learning_rate": 5.171904456663592e-06, "loss": 0.7982, "num_input_tokens_seen": 31469800, "step": 54545 }, { "epoch": 8.12481382186476, "grad_norm": 0.2111748605966568, "learning_rate": 5.167946938623053e-06, "loss": 0.8138, "num_input_tokens_seen": 31472680, "step": 54550 }, { "epoch": 8.12555853440572, "grad_norm": 0.24194222688674927, "learning_rate": 5.16399076075203e-06, "loss": 0.766, "num_input_tokens_seen": 31475496, "step": 54555 }, { "epoch": 8.126303246946678, "grad_norm": 0.31727856397628784, "learning_rate": 5.160035923317863e-06, "loss": 0.8105, "num_input_tokens_seen": 31478408, "step": 54560 }, { "epoch": 8.127047959487637, "grad_norm": 0.20663510262966156, "learning_rate": 5.156082426587808e-06, "loss": 0.7815, "num_input_tokens_seen": 31481480, "step": 54565 }, { "epoch": 8.127792672028598, "grad_norm": 0.3462856709957123, "learning_rate": 5.152130270829025e-06, "loss": 0.8143, "num_input_tokens_seen": 31484232, "step": 54570 }, { "epoch": 8.128537384569556, "grad_norm": 0.2894953787326813, "learning_rate": 5.14817945630858e-06, "loss": 0.8175, "num_input_tokens_seen": 31487080, "step": 54575 }, { "epoch": 8.129282097110515, "grad_norm": 0.25909289717674255, "learning_rate": 5.144229983293461e-06, "loss": 0.773, "num_input_tokens_seen": 31489800, "step": 54580 }, { "epoch": 8.130026809651474, "grad_norm": 0.17454734444618225, "learning_rate": 5.140281852050544e-06, "loss": 0.781, "num_input_tokens_seen": 31492936, "step": 54585 }, { "epoch": 8.130771522192434, "grad_norm": 0.17724204063415527, "learning_rate": 5.136335062846636e-06, "loss": 0.765, "num_input_tokens_seen": 31495944, "step": 54590 }, { "epoch": 8.131516234733393, "grad_norm": 0.20888455212116241, "learning_rate": 5.132389615948446e-06, "loss": 0.8057, "num_input_tokens_seen": 31499496, "step": 54595 }, { "epoch": 8.132260947274352, "grad_norm": 0.174704909324646, "learning_rate": 5.12844551162259e-06, "loss": 0.8099, "num_input_tokens_seen": 31502312, "step": 54600 }, { "epoch": 8.13300565981531, "grad_norm": 0.6109547019004822, "learning_rate": 5.124502750135601e-06, "loss": 0.831, "num_input_tokens_seen": 31505480, "step": 54605 }, { "epoch": 8.133750372356271, "grad_norm": 0.2029867023229599, "learning_rate": 5.120561331753901e-06, "loss": 0.7671, "num_input_tokens_seen": 31508296, "step": 54610 }, { "epoch": 8.13449508489723, "grad_norm": 0.17975729703903198, "learning_rate": 5.116621256743842e-06, "loss": 0.7904, "num_input_tokens_seen": 31511208, "step": 54615 }, { "epoch": 8.135239797438189, "grad_norm": 0.1593552976846695, "learning_rate": 5.112682525371687e-06, "loss": 0.8181, "num_input_tokens_seen": 31514440, "step": 54620 }, { "epoch": 8.135984509979147, "grad_norm": 0.26509106159210205, "learning_rate": 5.108745137903584e-06, "loss": 0.8009, "num_input_tokens_seen": 31517320, "step": 54625 }, { "epoch": 8.136729222520108, "grad_norm": 0.19807472825050354, "learning_rate": 5.104809094605612e-06, "loss": 0.782, "num_input_tokens_seen": 31520360, "step": 54630 }, { "epoch": 8.137473935061067, "grad_norm": 0.1437961608171463, "learning_rate": 5.100874395743752e-06, "loss": 0.7987, "num_input_tokens_seen": 31522920, "step": 54635 }, { "epoch": 8.138218647602026, "grad_norm": 0.21566811203956604, "learning_rate": 5.0969410415839026e-06, "loss": 0.8037, "num_input_tokens_seen": 31525832, "step": 54640 }, { "epoch": 8.138963360142984, "grad_norm": 0.1952158659696579, "learning_rate": 5.093009032391854e-06, "loss": 0.8033, "num_input_tokens_seen": 31528552, "step": 54645 }, { "epoch": 8.139708072683945, "grad_norm": 0.16361145675182343, "learning_rate": 5.089078368433317e-06, "loss": 0.7847, "num_input_tokens_seen": 31531368, "step": 54650 }, { "epoch": 8.140452785224904, "grad_norm": 0.1759740263223648, "learning_rate": 5.0851490499739144e-06, "loss": 0.8111, "num_input_tokens_seen": 31534120, "step": 54655 }, { "epoch": 8.141197497765862, "grad_norm": 0.2639465630054474, "learning_rate": 5.081221077279174e-06, "loss": 0.8231, "num_input_tokens_seen": 31537192, "step": 54660 }, { "epoch": 8.141942210306821, "grad_norm": 0.19147929549217224, "learning_rate": 5.0772944506145254e-06, "loss": 0.7898, "num_input_tokens_seen": 31540168, "step": 54665 }, { "epoch": 8.142686922847782, "grad_norm": 0.149480938911438, "learning_rate": 5.073369170245324e-06, "loss": 0.7784, "num_input_tokens_seen": 31542888, "step": 54670 }, { "epoch": 8.14343163538874, "grad_norm": 0.27955177426338196, "learning_rate": 5.069445236436813e-06, "loss": 0.7833, "num_input_tokens_seen": 31546184, "step": 54675 }, { "epoch": 8.1441763479297, "grad_norm": 0.16976144909858704, "learning_rate": 5.065522649454157e-06, "loss": 0.8321, "num_input_tokens_seen": 31549192, "step": 54680 }, { "epoch": 8.144921060470658, "grad_norm": 0.1963173747062683, "learning_rate": 5.061601409562436e-06, "loss": 0.8012, "num_input_tokens_seen": 31552008, "step": 54685 }, { "epoch": 8.145665773011617, "grad_norm": 0.18688298761844635, "learning_rate": 5.057681517026627e-06, "loss": 0.7808, "num_input_tokens_seen": 31555016, "step": 54690 }, { "epoch": 8.146410485552577, "grad_norm": 0.20292818546295166, "learning_rate": 5.053762972111623e-06, "loss": 0.8063, "num_input_tokens_seen": 31557704, "step": 54695 }, { "epoch": 8.147155198093536, "grad_norm": 0.2292557954788208, "learning_rate": 5.049845775082227e-06, "loss": 0.769, "num_input_tokens_seen": 31560456, "step": 54700 }, { "epoch": 8.147899910634495, "grad_norm": 0.23041898012161255, "learning_rate": 5.045929926203144e-06, "loss": 0.8275, "num_input_tokens_seen": 31563208, "step": 54705 }, { "epoch": 8.148644623175453, "grad_norm": 0.21337653696537018, "learning_rate": 5.04201542573898e-06, "loss": 0.8064, "num_input_tokens_seen": 31566216, "step": 54710 }, { "epoch": 8.149389335716414, "grad_norm": 0.21023644506931305, "learning_rate": 5.0381022739542734e-06, "loss": 0.8079, "num_input_tokens_seen": 31569032, "step": 54715 }, { "epoch": 8.150134048257373, "grad_norm": 0.28234463930130005, "learning_rate": 5.034190471113453e-06, "loss": 0.8068, "num_input_tokens_seen": 31571880, "step": 54720 }, { "epoch": 8.150878760798332, "grad_norm": 0.26190125942230225, "learning_rate": 5.0302800174808654e-06, "loss": 0.7939, "num_input_tokens_seen": 31574888, "step": 54725 }, { "epoch": 8.15162347333929, "grad_norm": 0.19585703313350677, "learning_rate": 5.026370913320766e-06, "loss": 0.7991, "num_input_tokens_seen": 31577800, "step": 54730 }, { "epoch": 8.15236818588025, "grad_norm": 0.17476271092891693, "learning_rate": 5.022463158897317e-06, "loss": 0.8029, "num_input_tokens_seen": 31580456, "step": 54735 }, { "epoch": 8.15311289842121, "grad_norm": 0.17533867061138153, "learning_rate": 5.018556754474588e-06, "loss": 0.789, "num_input_tokens_seen": 31583496, "step": 54740 }, { "epoch": 8.153857610962168, "grad_norm": 0.2067245990037918, "learning_rate": 5.014651700316547e-06, "loss": 0.8212, "num_input_tokens_seen": 31586408, "step": 54745 }, { "epoch": 8.154602323503127, "grad_norm": 0.3546876907348633, "learning_rate": 5.010747996687087e-06, "loss": 0.8011, "num_input_tokens_seen": 31589192, "step": 54750 }, { "epoch": 8.155347036044088, "grad_norm": 0.2452705055475235, "learning_rate": 5.006845643850011e-06, "loss": 0.8141, "num_input_tokens_seen": 31592168, "step": 54755 }, { "epoch": 8.156091748585046, "grad_norm": 0.2512238919734955, "learning_rate": 5.002944642069019e-06, "loss": 0.8309, "num_input_tokens_seen": 31594952, "step": 54760 }, { "epoch": 8.156836461126005, "grad_norm": 0.21105068922042847, "learning_rate": 4.9990449916077234e-06, "loss": 0.8127, "num_input_tokens_seen": 31597960, "step": 54765 }, { "epoch": 8.157581173666964, "grad_norm": 0.16774088144302368, "learning_rate": 4.995146692729661e-06, "loss": 0.8258, "num_input_tokens_seen": 31600712, "step": 54770 }, { "epoch": 8.158325886207924, "grad_norm": 0.19723781943321228, "learning_rate": 4.99124974569824e-06, "loss": 0.7655, "num_input_tokens_seen": 31603816, "step": 54775 }, { "epoch": 8.159070598748883, "grad_norm": 0.18937553465366364, "learning_rate": 4.987354150776819e-06, "loss": 0.8069, "num_input_tokens_seen": 31606600, "step": 54780 }, { "epoch": 8.159815311289842, "grad_norm": 0.20404578745365143, "learning_rate": 4.9834599082286325e-06, "loss": 0.81, "num_input_tokens_seen": 31609544, "step": 54785 }, { "epoch": 8.1605600238308, "grad_norm": 0.20352108776569366, "learning_rate": 4.979567018316847e-06, "loss": 0.7979, "num_input_tokens_seen": 31612904, "step": 54790 }, { "epoch": 8.161304736371761, "grad_norm": 0.20532678067684174, "learning_rate": 4.975675481304523e-06, "loss": 0.7968, "num_input_tokens_seen": 31615656, "step": 54795 }, { "epoch": 8.16204944891272, "grad_norm": 0.2592163681983948, "learning_rate": 4.971785297454637e-06, "loss": 0.7977, "num_input_tokens_seen": 31618536, "step": 54800 }, { "epoch": 8.162794161453679, "grad_norm": 0.3613453209400177, "learning_rate": 4.9678964670300785e-06, "loss": 0.818, "num_input_tokens_seen": 31621512, "step": 54805 }, { "epoch": 8.163538873994638, "grad_norm": 0.2166486531496048, "learning_rate": 4.964008990293626e-06, "loss": 0.8251, "num_input_tokens_seen": 31624296, "step": 54810 }, { "epoch": 8.164283586535598, "grad_norm": 0.26652270555496216, "learning_rate": 4.960122867507983e-06, "loss": 0.7959, "num_input_tokens_seen": 31627400, "step": 54815 }, { "epoch": 8.165028299076557, "grad_norm": 0.2595973610877991, "learning_rate": 4.95623809893577e-06, "loss": 0.7856, "num_input_tokens_seen": 31630408, "step": 54820 }, { "epoch": 8.165773011617516, "grad_norm": 0.2611111104488373, "learning_rate": 4.952354684839486e-06, "loss": 0.8214, "num_input_tokens_seen": 31633352, "step": 54825 }, { "epoch": 8.166517724158474, "grad_norm": 0.28336477279663086, "learning_rate": 4.948472625481565e-06, "loss": 0.7892, "num_input_tokens_seen": 31636264, "step": 54830 }, { "epoch": 8.167262436699435, "grad_norm": 0.24789489805698395, "learning_rate": 4.944591921124348e-06, "loss": 0.8166, "num_input_tokens_seen": 31638920, "step": 54835 }, { "epoch": 8.168007149240394, "grad_norm": 0.26357302069664, "learning_rate": 4.940712572030062e-06, "loss": 0.7976, "num_input_tokens_seen": 31641896, "step": 54840 }, { "epoch": 8.168751861781352, "grad_norm": 0.19452235102653503, "learning_rate": 4.936834578460867e-06, "loss": 0.7969, "num_input_tokens_seen": 31644712, "step": 54845 }, { "epoch": 8.169496574322311, "grad_norm": 0.2353665679693222, "learning_rate": 4.932957940678818e-06, "loss": 0.8008, "num_input_tokens_seen": 31647688, "step": 54850 }, { "epoch": 8.17024128686327, "grad_norm": 0.19707351922988892, "learning_rate": 4.9290826589458854e-06, "loss": 0.7934, "num_input_tokens_seen": 31650280, "step": 54855 }, { "epoch": 8.17098599940423, "grad_norm": 0.35336557030677795, "learning_rate": 4.92520873352395e-06, "loss": 0.804, "num_input_tokens_seen": 31653224, "step": 54860 }, { "epoch": 8.17173071194519, "grad_norm": 0.20018324255943298, "learning_rate": 4.921336164674786e-06, "loss": 0.8037, "num_input_tokens_seen": 31656392, "step": 54865 }, { "epoch": 8.172475424486148, "grad_norm": 0.18759572505950928, "learning_rate": 4.917464952660094e-06, "loss": 0.8, "num_input_tokens_seen": 31659272, "step": 54870 }, { "epoch": 8.173220137027107, "grad_norm": 0.21347492933273315, "learning_rate": 4.9135950977414666e-06, "loss": 0.7964, "num_input_tokens_seen": 31662056, "step": 54875 }, { "epoch": 8.173964849568067, "grad_norm": 0.24084258079528809, "learning_rate": 4.909726600180417e-06, "loss": 0.7883, "num_input_tokens_seen": 31665128, "step": 54880 }, { "epoch": 8.174709562109026, "grad_norm": 0.267819344997406, "learning_rate": 4.9058594602383625e-06, "loss": 0.7851, "num_input_tokens_seen": 31668232, "step": 54885 }, { "epoch": 8.175454274649985, "grad_norm": 0.270952045917511, "learning_rate": 4.9019936781766275e-06, "loss": 0.7949, "num_input_tokens_seen": 31671112, "step": 54890 }, { "epoch": 8.176198987190944, "grad_norm": 0.19569340348243713, "learning_rate": 4.898129254256448e-06, "loss": 0.8023, "num_input_tokens_seen": 31674088, "step": 54895 }, { "epoch": 8.176943699731904, "grad_norm": 0.2665509283542633, "learning_rate": 4.8942661887389715e-06, "loss": 0.7829, "num_input_tokens_seen": 31676680, "step": 54900 }, { "epoch": 8.177688412272863, "grad_norm": 0.228279709815979, "learning_rate": 4.890404481885244e-06, "loss": 0.8173, "num_input_tokens_seen": 31679624, "step": 54905 }, { "epoch": 8.178433124813822, "grad_norm": 0.20869752764701843, "learning_rate": 4.886544133956211e-06, "loss": 0.8111, "num_input_tokens_seen": 31682568, "step": 54910 }, { "epoch": 8.17917783735478, "grad_norm": 0.21695305407047272, "learning_rate": 4.882685145212754e-06, "loss": 0.7892, "num_input_tokens_seen": 31685448, "step": 54915 }, { "epoch": 8.17992254989574, "grad_norm": 0.18104708194732666, "learning_rate": 4.878827515915643e-06, "loss": 0.7615, "num_input_tokens_seen": 31688264, "step": 54920 }, { "epoch": 8.1806672624367, "grad_norm": 0.20503118634223938, "learning_rate": 4.8749712463255605e-06, "loss": 0.8004, "num_input_tokens_seen": 31691336, "step": 54925 }, { "epoch": 8.181411974977658, "grad_norm": 0.22533853352069855, "learning_rate": 4.871116336703099e-06, "loss": 0.8349, "num_input_tokens_seen": 31694376, "step": 54930 }, { "epoch": 8.182156687518617, "grad_norm": 0.24730905890464783, "learning_rate": 4.867262787308765e-06, "loss": 0.8092, "num_input_tokens_seen": 31697352, "step": 54935 }, { "epoch": 8.182901400059578, "grad_norm": 0.20980356633663177, "learning_rate": 4.863410598402959e-06, "loss": 0.8051, "num_input_tokens_seen": 31700232, "step": 54940 }, { "epoch": 8.183646112600536, "grad_norm": 0.5048848390579224, "learning_rate": 4.859559770245986e-06, "loss": 0.7921, "num_input_tokens_seen": 31703240, "step": 54945 }, { "epoch": 8.184390825141495, "grad_norm": 0.22863245010375977, "learning_rate": 4.855710303098082e-06, "loss": 0.7897, "num_input_tokens_seen": 31705992, "step": 54950 }, { "epoch": 8.185135537682454, "grad_norm": 0.2198096513748169, "learning_rate": 4.851862197219373e-06, "loss": 0.7922, "num_input_tokens_seen": 31709192, "step": 54955 }, { "epoch": 8.185880250223414, "grad_norm": 0.3426479697227478, "learning_rate": 4.8480154528699e-06, "loss": 0.8074, "num_input_tokens_seen": 31711880, "step": 54960 }, { "epoch": 8.186624962764373, "grad_norm": 0.2610565423965454, "learning_rate": 4.844170070309612e-06, "loss": 0.7629, "num_input_tokens_seen": 31714568, "step": 54965 }, { "epoch": 8.187369675305332, "grad_norm": 0.2826596200466156, "learning_rate": 4.840326049798369e-06, "loss": 0.7793, "num_input_tokens_seen": 31717480, "step": 54970 }, { "epoch": 8.18811438784629, "grad_norm": 0.2769985795021057, "learning_rate": 4.83648339159592e-06, "loss": 0.8183, "num_input_tokens_seen": 31720488, "step": 54975 }, { "epoch": 8.188859100387251, "grad_norm": 0.22429999709129333, "learning_rate": 4.832642095961953e-06, "loss": 0.8005, "num_input_tokens_seen": 31723368, "step": 54980 }, { "epoch": 8.18960381292821, "grad_norm": 0.20872090756893158, "learning_rate": 4.828802163156032e-06, "loss": 0.7942, "num_input_tokens_seen": 31726344, "step": 54985 }, { "epoch": 8.190348525469169, "grad_norm": 0.2573779225349426, "learning_rate": 4.824963593437648e-06, "loss": 0.7704, "num_input_tokens_seen": 31729096, "step": 54990 }, { "epoch": 8.191093238010128, "grad_norm": 0.22234304249286652, "learning_rate": 4.821126387066202e-06, "loss": 0.7994, "num_input_tokens_seen": 31731976, "step": 54995 }, { "epoch": 8.191837950551088, "grad_norm": 0.24020524322986603, "learning_rate": 4.817290544300998e-06, "loss": 0.7994, "num_input_tokens_seen": 31735304, "step": 55000 }, { "epoch": 8.192582663092047, "grad_norm": 0.1464707851409912, "learning_rate": 4.813456065401237e-06, "loss": 0.8135, "num_input_tokens_seen": 31737928, "step": 55005 }, { "epoch": 8.193327375633006, "grad_norm": 0.2502076029777527, "learning_rate": 4.809622950626041e-06, "loss": 0.7765, "num_input_tokens_seen": 31740776, "step": 55010 }, { "epoch": 8.194072088173964, "grad_norm": 0.16107052564620972, "learning_rate": 4.805791200234441e-06, "loss": 0.7804, "num_input_tokens_seen": 31743432, "step": 55015 }, { "epoch": 8.194816800714925, "grad_norm": 0.1926073282957077, "learning_rate": 4.801960814485373e-06, "loss": 0.7818, "num_input_tokens_seen": 31746376, "step": 55020 }, { "epoch": 8.195561513255884, "grad_norm": 0.2674628496170044, "learning_rate": 4.798131793637667e-06, "loss": 0.7864, "num_input_tokens_seen": 31749256, "step": 55025 }, { "epoch": 8.196306225796842, "grad_norm": 0.1781259924173355, "learning_rate": 4.794304137950079e-06, "loss": 0.8007, "num_input_tokens_seen": 31752072, "step": 55030 }, { "epoch": 8.197050938337801, "grad_norm": 0.41510525345802307, "learning_rate": 4.790477847681274e-06, "loss": 0.7882, "num_input_tokens_seen": 31754920, "step": 55035 }, { "epoch": 8.19779565087876, "grad_norm": 0.14727725088596344, "learning_rate": 4.786652923089804e-06, "loss": 0.8135, "num_input_tokens_seen": 31757672, "step": 55040 }, { "epoch": 8.19854036341972, "grad_norm": 0.22777880728244781, "learning_rate": 4.782829364434146e-06, "loss": 0.7929, "num_input_tokens_seen": 31760712, "step": 55045 }, { "epoch": 8.19928507596068, "grad_norm": 0.1779782772064209, "learning_rate": 4.779007171972685e-06, "loss": 0.8025, "num_input_tokens_seen": 31763560, "step": 55050 }, { "epoch": 8.200029788501638, "grad_norm": 0.2061464637517929, "learning_rate": 4.775186345963706e-06, "loss": 0.7996, "num_input_tokens_seen": 31766056, "step": 55055 }, { "epoch": 8.200774501042597, "grad_norm": 0.23839351534843445, "learning_rate": 4.771366886665412e-06, "loss": 0.8071, "num_input_tokens_seen": 31768712, "step": 55060 }, { "epoch": 8.201519213583557, "grad_norm": 0.2546086609363556, "learning_rate": 4.767548794335894e-06, "loss": 0.8479, "num_input_tokens_seen": 31771656, "step": 55065 }, { "epoch": 8.202263926124516, "grad_norm": 0.25903037190437317, "learning_rate": 4.763732069233173e-06, "loss": 0.776, "num_input_tokens_seen": 31774600, "step": 55070 }, { "epoch": 8.203008638665475, "grad_norm": 0.2026570737361908, "learning_rate": 4.759916711615162e-06, "loss": 0.7838, "num_input_tokens_seen": 31777384, "step": 55075 }, { "epoch": 8.203753351206434, "grad_norm": 0.21238496899604797, "learning_rate": 4.756102721739686e-06, "loss": 0.7949, "num_input_tokens_seen": 31780008, "step": 55080 }, { "epoch": 8.204498063747394, "grad_norm": 0.31623363494873047, "learning_rate": 4.752290099864484e-06, "loss": 0.7946, "num_input_tokens_seen": 31782728, "step": 55085 }, { "epoch": 8.205242776288353, "grad_norm": 0.1661866307258606, "learning_rate": 4.748478846247198e-06, "loss": 0.7866, "num_input_tokens_seen": 31785672, "step": 55090 }, { "epoch": 8.205987488829312, "grad_norm": 0.2430407851934433, "learning_rate": 4.744668961145371e-06, "loss": 0.7908, "num_input_tokens_seen": 31788328, "step": 55095 }, { "epoch": 8.20673220137027, "grad_norm": 0.2531185746192932, "learning_rate": 4.740860444816472e-06, "loss": 0.8091, "num_input_tokens_seen": 31791048, "step": 55100 }, { "epoch": 8.207476913911231, "grad_norm": 0.19204029440879822, "learning_rate": 4.7370532975178575e-06, "loss": 0.7896, "num_input_tokens_seen": 31793768, "step": 55105 }, { "epoch": 8.20822162645219, "grad_norm": 0.21567733585834503, "learning_rate": 4.733247519506789e-06, "loss": 0.7914, "num_input_tokens_seen": 31796840, "step": 55110 }, { "epoch": 8.208966338993148, "grad_norm": 0.3311154842376709, "learning_rate": 4.7294431110404545e-06, "loss": 0.7882, "num_input_tokens_seen": 31799624, "step": 55115 }, { "epoch": 8.209711051534107, "grad_norm": 0.2270793467760086, "learning_rate": 4.725640072375942e-06, "loss": 0.796, "num_input_tokens_seen": 31802344, "step": 55120 }, { "epoch": 8.210455764075068, "grad_norm": 0.2630622684955597, "learning_rate": 4.7218384037702425e-06, "loss": 0.8104, "num_input_tokens_seen": 31805384, "step": 55125 }, { "epoch": 8.211200476616026, "grad_norm": 0.1794511079788208, "learning_rate": 4.71803810548026e-06, "loss": 0.7784, "num_input_tokens_seen": 31808328, "step": 55130 }, { "epoch": 8.211945189156985, "grad_norm": 0.2682420015335083, "learning_rate": 4.714239177762808e-06, "loss": 0.791, "num_input_tokens_seen": 31810952, "step": 55135 }, { "epoch": 8.212689901697944, "grad_norm": 0.27750954031944275, "learning_rate": 4.710441620874589e-06, "loss": 0.7868, "num_input_tokens_seen": 31814120, "step": 55140 }, { "epoch": 8.213434614238905, "grad_norm": 0.20453080534934998, "learning_rate": 4.706645435072243e-06, "loss": 0.8183, "num_input_tokens_seen": 31817448, "step": 55145 }, { "epoch": 8.214179326779863, "grad_norm": 0.27543210983276367, "learning_rate": 4.702850620612284e-06, "loss": 0.7963, "num_input_tokens_seen": 31820264, "step": 55150 }, { "epoch": 8.214924039320822, "grad_norm": 0.23123827576637268, "learning_rate": 4.699057177751157e-06, "loss": 0.8161, "num_input_tokens_seen": 31823112, "step": 55155 }, { "epoch": 8.21566875186178, "grad_norm": 0.2577585279941559, "learning_rate": 4.695265106745209e-06, "loss": 0.823, "num_input_tokens_seen": 31825864, "step": 55160 }, { "epoch": 8.216413464402741, "grad_norm": 0.22374080121517181, "learning_rate": 4.691474407850699e-06, "loss": 0.7899, "num_input_tokens_seen": 31829000, "step": 55165 }, { "epoch": 8.2171581769437, "grad_norm": 0.18100641667842865, "learning_rate": 4.687685081323773e-06, "loss": 0.8039, "num_input_tokens_seen": 31831752, "step": 55170 }, { "epoch": 8.217902889484659, "grad_norm": 0.20562371611595154, "learning_rate": 4.683897127420503e-06, "loss": 0.7903, "num_input_tokens_seen": 31834568, "step": 55175 }, { "epoch": 8.218647602025618, "grad_norm": 0.20298610627651215, "learning_rate": 4.680110546396868e-06, "loss": 0.815, "num_input_tokens_seen": 31837640, "step": 55180 }, { "epoch": 8.219392314566578, "grad_norm": 0.15629200637340546, "learning_rate": 4.676325338508755e-06, "loss": 0.7978, "num_input_tokens_seen": 31840520, "step": 55185 }, { "epoch": 8.220137027107537, "grad_norm": 0.24809132516384125, "learning_rate": 4.672541504011938e-06, "loss": 0.8113, "num_input_tokens_seen": 31843528, "step": 55190 }, { "epoch": 8.220881739648496, "grad_norm": 0.15403388440608978, "learning_rate": 4.668759043162121e-06, "loss": 0.7998, "num_input_tokens_seen": 31846152, "step": 55195 }, { "epoch": 8.221626452189454, "grad_norm": 0.23940293490886688, "learning_rate": 4.664977956214914e-06, "loss": 0.7779, "num_input_tokens_seen": 31849096, "step": 55200 }, { "epoch": 8.222371164730415, "grad_norm": 0.18294134736061096, "learning_rate": 4.661198243425813e-06, "loss": 0.8118, "num_input_tokens_seen": 31851688, "step": 55205 }, { "epoch": 8.223115877271374, "grad_norm": 0.2398579865694046, "learning_rate": 4.6574199050502445e-06, "loss": 0.7763, "num_input_tokens_seen": 31854408, "step": 55210 }, { "epoch": 8.223860589812332, "grad_norm": 0.1806725114583969, "learning_rate": 4.653642941343531e-06, "loss": 0.8076, "num_input_tokens_seen": 31857352, "step": 55215 }, { "epoch": 8.224605302353291, "grad_norm": 0.20262114703655243, "learning_rate": 4.649867352560905e-06, "loss": 0.8187, "num_input_tokens_seen": 31860104, "step": 55220 }, { "epoch": 8.22535001489425, "grad_norm": 0.2928803861141205, "learning_rate": 4.646093138957514e-06, "loss": 0.8098, "num_input_tokens_seen": 31862856, "step": 55225 }, { "epoch": 8.22609472743521, "grad_norm": 0.16532017290592194, "learning_rate": 4.6423203007883886e-06, "loss": 0.7857, "num_input_tokens_seen": 31865576, "step": 55230 }, { "epoch": 8.22683943997617, "grad_norm": 0.2171473354101181, "learning_rate": 4.638548838308493e-06, "loss": 0.8168, "num_input_tokens_seen": 31868584, "step": 55235 }, { "epoch": 8.227584152517128, "grad_norm": 0.27588480710983276, "learning_rate": 4.6347787517726785e-06, "loss": 0.805, "num_input_tokens_seen": 31871304, "step": 55240 }, { "epoch": 8.228328865058087, "grad_norm": 0.21007952094078064, "learning_rate": 4.6310100414357185e-06, "loss": 0.7964, "num_input_tokens_seen": 31874056, "step": 55245 }, { "epoch": 8.229073577599047, "grad_norm": 0.1906416416168213, "learning_rate": 4.6272427075522845e-06, "loss": 0.8137, "num_input_tokens_seen": 31876968, "step": 55250 }, { "epoch": 8.229818290140006, "grad_norm": 0.23162023723125458, "learning_rate": 4.623476750376956e-06, "loss": 0.8536, "num_input_tokens_seen": 31880008, "step": 55255 }, { "epoch": 8.230563002680965, "grad_norm": 0.2502155303955078, "learning_rate": 4.6197121701642286e-06, "loss": 0.8124, "num_input_tokens_seen": 31883240, "step": 55260 }, { "epoch": 8.231307715221924, "grad_norm": 0.16120852530002594, "learning_rate": 4.615948967168496e-06, "loss": 0.7861, "num_input_tokens_seen": 31886120, "step": 55265 }, { "epoch": 8.232052427762884, "grad_norm": 0.18375557661056519, "learning_rate": 4.612187141644056e-06, "loss": 0.7972, "num_input_tokens_seen": 31889192, "step": 55270 }, { "epoch": 8.232797140303843, "grad_norm": 0.23479063808918, "learning_rate": 4.6084266938451135e-06, "loss": 0.8177, "num_input_tokens_seen": 31891880, "step": 55275 }, { "epoch": 8.233541852844802, "grad_norm": 0.1668580174446106, "learning_rate": 4.604667624025788e-06, "loss": 0.7819, "num_input_tokens_seen": 31894472, "step": 55280 }, { "epoch": 8.23428656538576, "grad_norm": 0.2109980285167694, "learning_rate": 4.600909932440103e-06, "loss": 0.7798, "num_input_tokens_seen": 31897448, "step": 55285 }, { "epoch": 8.235031277926721, "grad_norm": 0.20889797806739807, "learning_rate": 4.59715361934199e-06, "loss": 0.7704, "num_input_tokens_seen": 31900040, "step": 55290 }, { "epoch": 8.23577599046768, "grad_norm": 0.2679522633552551, "learning_rate": 4.593398684985281e-06, "loss": 0.8338, "num_input_tokens_seen": 31902856, "step": 55295 }, { "epoch": 8.236520703008638, "grad_norm": 0.21516065299510956, "learning_rate": 4.589645129623729e-06, "loss": 0.7918, "num_input_tokens_seen": 31905672, "step": 55300 }, { "epoch": 8.237265415549597, "grad_norm": 0.19680757820606232, "learning_rate": 4.585892953510978e-06, "loss": 0.7935, "num_input_tokens_seen": 31908552, "step": 55305 }, { "epoch": 8.238010128090558, "grad_norm": 0.18827807903289795, "learning_rate": 4.582142156900576e-06, "loss": 0.7984, "num_input_tokens_seen": 31911400, "step": 55310 }, { "epoch": 8.238754840631517, "grad_norm": 0.16792768239974976, "learning_rate": 4.578392740045994e-06, "loss": 0.8128, "num_input_tokens_seen": 31914024, "step": 55315 }, { "epoch": 8.239499553172475, "grad_norm": 0.18431951105594635, "learning_rate": 4.5746447032006005e-06, "loss": 0.8182, "num_input_tokens_seen": 31916968, "step": 55320 }, { "epoch": 8.240244265713434, "grad_norm": 0.20061877369880676, "learning_rate": 4.570898046617677e-06, "loss": 0.7846, "num_input_tokens_seen": 31919624, "step": 55325 }, { "epoch": 8.240988978254395, "grad_norm": 0.22499670088291168, "learning_rate": 4.567152770550412e-06, "loss": 0.8082, "num_input_tokens_seen": 31922472, "step": 55330 }, { "epoch": 8.241733690795353, "grad_norm": 0.21913377940654755, "learning_rate": 4.563408875251882e-06, "loss": 0.7735, "num_input_tokens_seen": 31925128, "step": 55335 }, { "epoch": 8.242478403336312, "grad_norm": 0.2076420933008194, "learning_rate": 4.5596663609750904e-06, "loss": 0.7789, "num_input_tokens_seen": 31928136, "step": 55340 }, { "epoch": 8.24322311587727, "grad_norm": 0.22198283672332764, "learning_rate": 4.555925227972946e-06, "loss": 0.8345, "num_input_tokens_seen": 31930856, "step": 55345 }, { "epoch": 8.243967828418231, "grad_norm": 0.2865961790084839, "learning_rate": 4.552185476498252e-06, "loss": 0.8173, "num_input_tokens_seen": 31933640, "step": 55350 }, { "epoch": 8.24471254095919, "grad_norm": 0.19599460065364838, "learning_rate": 4.5484471068037275e-06, "loss": 0.8165, "num_input_tokens_seen": 31936744, "step": 55355 }, { "epoch": 8.245457253500149, "grad_norm": 0.2615841031074524, "learning_rate": 4.544710119141996e-06, "loss": 0.8068, "num_input_tokens_seen": 31939624, "step": 55360 }, { "epoch": 8.246201966041108, "grad_norm": 0.2367016226053238, "learning_rate": 4.540974513765597e-06, "loss": 0.8063, "num_input_tokens_seen": 31942696, "step": 55365 }, { "epoch": 8.246946678582066, "grad_norm": 0.16120381653308868, "learning_rate": 4.537240290926955e-06, "loss": 0.8004, "num_input_tokens_seen": 31945736, "step": 55370 }, { "epoch": 8.247691391123027, "grad_norm": 0.1373877227306366, "learning_rate": 4.5335074508784185e-06, "loss": 0.7771, "num_input_tokens_seen": 31948456, "step": 55375 }, { "epoch": 8.248436103663986, "grad_norm": 0.19671502709388733, "learning_rate": 4.529775993872237e-06, "loss": 0.7919, "num_input_tokens_seen": 31951496, "step": 55380 }, { "epoch": 8.249180816204944, "grad_norm": 0.24653862416744232, "learning_rate": 4.526045920160574e-06, "loss": 0.8181, "num_input_tokens_seen": 31954152, "step": 55385 }, { "epoch": 8.249925528745903, "grad_norm": 0.23285384476184845, "learning_rate": 4.522317229995479e-06, "loss": 0.7913, "num_input_tokens_seen": 31956968, "step": 55390 }, { "epoch": 8.250670241286864, "grad_norm": 0.21835574507713318, "learning_rate": 4.518589923628932e-06, "loss": 0.7698, "num_input_tokens_seen": 31959784, "step": 55395 }, { "epoch": 8.251414953827823, "grad_norm": 0.3119858503341675, "learning_rate": 4.514864001312813e-06, "loss": 0.8221, "num_input_tokens_seen": 31962600, "step": 55400 }, { "epoch": 8.252159666368781, "grad_norm": 0.22926387190818787, "learning_rate": 4.511139463298891e-06, "loss": 0.7966, "num_input_tokens_seen": 31965544, "step": 55405 }, { "epoch": 8.25290437890974, "grad_norm": 0.1822068840265274, "learning_rate": 4.507416309838861e-06, "loss": 0.7651, "num_input_tokens_seen": 31968264, "step": 55410 }, { "epoch": 8.2536490914507, "grad_norm": 0.2587415277957916, "learning_rate": 4.503694541184322e-06, "loss": 0.7797, "num_input_tokens_seen": 31971016, "step": 55415 }, { "epoch": 8.25439380399166, "grad_norm": 0.23546728491783142, "learning_rate": 4.499974157586773e-06, "loss": 0.8067, "num_input_tokens_seen": 31974152, "step": 55420 }, { "epoch": 8.255138516532618, "grad_norm": 0.24557431042194366, "learning_rate": 4.49625515929763e-06, "loss": 0.7991, "num_input_tokens_seen": 31976936, "step": 55425 }, { "epoch": 8.255883229073577, "grad_norm": 0.19550804793834686, "learning_rate": 4.492537546568196e-06, "loss": 0.7835, "num_input_tokens_seen": 31979784, "step": 55430 }, { "epoch": 8.256627941614537, "grad_norm": 0.19141072034835815, "learning_rate": 4.488821319649702e-06, "loss": 0.7933, "num_input_tokens_seen": 31982440, "step": 55435 }, { "epoch": 8.257372654155496, "grad_norm": 0.2003641575574875, "learning_rate": 4.485106478793266e-06, "loss": 0.8276, "num_input_tokens_seen": 31985448, "step": 55440 }, { "epoch": 8.258117366696455, "grad_norm": 0.15928314626216888, "learning_rate": 4.481393024249925e-06, "loss": 0.8068, "num_input_tokens_seen": 31988488, "step": 55445 }, { "epoch": 8.258862079237414, "grad_norm": 0.2656005918979645, "learning_rate": 4.477680956270621e-06, "loss": 0.8199, "num_input_tokens_seen": 31991432, "step": 55450 }, { "epoch": 8.259606791778374, "grad_norm": 0.3053394556045532, "learning_rate": 4.4739702751062015e-06, "loss": 0.8, "num_input_tokens_seen": 31994408, "step": 55455 }, { "epoch": 8.260351504319333, "grad_norm": 0.23549897968769073, "learning_rate": 4.470260981007418e-06, "loss": 0.8242, "num_input_tokens_seen": 31997192, "step": 55460 }, { "epoch": 8.261096216860292, "grad_norm": 0.16813626885414124, "learning_rate": 4.466553074224936e-06, "loss": 0.8159, "num_input_tokens_seen": 31999848, "step": 55465 }, { "epoch": 8.26184092940125, "grad_norm": 0.23115138709545135, "learning_rate": 4.462846555009312e-06, "loss": 0.8149, "num_input_tokens_seen": 32002856, "step": 55470 }, { "epoch": 8.262585641942211, "grad_norm": 0.3884211778640747, "learning_rate": 4.459141423611016e-06, "loss": 0.7811, "num_input_tokens_seen": 32005640, "step": 55475 }, { "epoch": 8.26333035448317, "grad_norm": 0.2551940083503723, "learning_rate": 4.455437680280427e-06, "loss": 0.8016, "num_input_tokens_seen": 32008488, "step": 55480 }, { "epoch": 8.264075067024129, "grad_norm": 0.34914594888687134, "learning_rate": 4.451735325267836e-06, "loss": 0.7959, "num_input_tokens_seen": 32011368, "step": 55485 }, { "epoch": 8.264819779565087, "grad_norm": 0.2686424255371094, "learning_rate": 4.448034358823424e-06, "loss": 0.8078, "num_input_tokens_seen": 32014312, "step": 55490 }, { "epoch": 8.265564492106048, "grad_norm": 0.142403244972229, "learning_rate": 4.444334781197301e-06, "loss": 0.832, "num_input_tokens_seen": 32017480, "step": 55495 }, { "epoch": 8.266309204647007, "grad_norm": 0.2291475236415863, "learning_rate": 4.440636592639452e-06, "loss": 0.7903, "num_input_tokens_seen": 32020936, "step": 55500 }, { "epoch": 8.267053917187965, "grad_norm": 0.14479874074459076, "learning_rate": 4.436939793399803e-06, "loss": 0.81, "num_input_tokens_seen": 32023656, "step": 55505 }, { "epoch": 8.267798629728924, "grad_norm": 0.19280080497264862, "learning_rate": 4.433244383728149e-06, "loss": 0.8225, "num_input_tokens_seen": 32026760, "step": 55510 }, { "epoch": 8.268543342269885, "grad_norm": 0.20772258937358856, "learning_rate": 4.429550363874224e-06, "loss": 0.7989, "num_input_tokens_seen": 32029544, "step": 55515 }, { "epoch": 8.269288054810843, "grad_norm": 0.22433172166347504, "learning_rate": 4.4258577340876514e-06, "loss": 0.8008, "num_input_tokens_seen": 32032488, "step": 55520 }, { "epoch": 8.270032767351802, "grad_norm": 0.3425881266593933, "learning_rate": 4.422166494617966e-06, "loss": 0.8038, "num_input_tokens_seen": 32035528, "step": 55525 }, { "epoch": 8.270777479892761, "grad_norm": 0.27732959389686584, "learning_rate": 4.418476645714609e-06, "loss": 0.7919, "num_input_tokens_seen": 32038696, "step": 55530 }, { "epoch": 8.271522192433721, "grad_norm": 0.216991126537323, "learning_rate": 4.414788187626917e-06, "loss": 0.7875, "num_input_tokens_seen": 32041640, "step": 55535 }, { "epoch": 8.27226690497468, "grad_norm": 0.20860129594802856, "learning_rate": 4.411101120604147e-06, "loss": 0.802, "num_input_tokens_seen": 32044520, "step": 55540 }, { "epoch": 8.273011617515639, "grad_norm": 0.19564735889434814, "learning_rate": 4.40741544489546e-06, "loss": 0.7887, "num_input_tokens_seen": 32047592, "step": 55545 }, { "epoch": 8.273756330056598, "grad_norm": 0.2070075124502182, "learning_rate": 4.403731160749907e-06, "loss": 0.8271, "num_input_tokens_seen": 32050120, "step": 55550 }, { "epoch": 8.274501042597556, "grad_norm": 0.2514762878417969, "learning_rate": 4.400048268416465e-06, "loss": 0.8142, "num_input_tokens_seen": 32053160, "step": 55555 }, { "epoch": 8.275245755138517, "grad_norm": 0.23347896337509155, "learning_rate": 4.396366768144009e-06, "loss": 0.8202, "num_input_tokens_seen": 32055848, "step": 55560 }, { "epoch": 8.275990467679476, "grad_norm": 0.2809334099292755, "learning_rate": 4.3926866601813224e-06, "loss": 0.792, "num_input_tokens_seen": 32058632, "step": 55565 }, { "epoch": 8.276735180220435, "grad_norm": 0.23747259378433228, "learning_rate": 4.389007944777082e-06, "loss": 0.7912, "num_input_tokens_seen": 32061256, "step": 55570 }, { "epoch": 8.277479892761393, "grad_norm": 0.2721523642539978, "learning_rate": 4.385330622179887e-06, "loss": 0.8053, "num_input_tokens_seen": 32064392, "step": 55575 }, { "epoch": 8.278224605302354, "grad_norm": 0.2989159822463989, "learning_rate": 4.3816546926382345e-06, "loss": 0.7964, "num_input_tokens_seen": 32067368, "step": 55580 }, { "epoch": 8.278969317843313, "grad_norm": 0.22170114517211914, "learning_rate": 4.377980156400538e-06, "loss": 0.8355, "num_input_tokens_seen": 32070344, "step": 55585 }, { "epoch": 8.279714030384271, "grad_norm": 0.25002428889274597, "learning_rate": 4.374307013715093e-06, "loss": 0.8483, "num_input_tokens_seen": 32073096, "step": 55590 }, { "epoch": 8.28045874292523, "grad_norm": 0.22567763924598694, "learning_rate": 4.370635264830122e-06, "loss": 0.8251, "num_input_tokens_seen": 32076296, "step": 55595 }, { "epoch": 8.28120345546619, "grad_norm": 0.24348174035549164, "learning_rate": 4.366964909993751e-06, "loss": 0.7905, "num_input_tokens_seen": 32078984, "step": 55600 }, { "epoch": 8.28194816800715, "grad_norm": 0.16635073721408844, "learning_rate": 4.363295949453999e-06, "loss": 0.8171, "num_input_tokens_seen": 32081736, "step": 55605 }, { "epoch": 8.282692880548108, "grad_norm": 0.30398035049438477, "learning_rate": 4.3596283834588054e-06, "loss": 0.8055, "num_input_tokens_seen": 32084744, "step": 55610 }, { "epoch": 8.283437593089067, "grad_norm": 0.22885647416114807, "learning_rate": 4.355962212256006e-06, "loss": 0.8235, "num_input_tokens_seen": 32087720, "step": 55615 }, { "epoch": 8.284182305630027, "grad_norm": 0.1766340583562851, "learning_rate": 4.3522974360933475e-06, "loss": 0.8053, "num_input_tokens_seen": 32090504, "step": 55620 }, { "epoch": 8.284927018170986, "grad_norm": 0.2780836820602417, "learning_rate": 4.348634055218489e-06, "loss": 0.7846, "num_input_tokens_seen": 32093640, "step": 55625 }, { "epoch": 8.285671730711945, "grad_norm": 0.18710915744304657, "learning_rate": 4.34497206987897e-06, "loss": 0.8229, "num_input_tokens_seen": 32096552, "step": 55630 }, { "epoch": 8.286416443252904, "grad_norm": 0.20587453246116638, "learning_rate": 4.3413114803222685e-06, "loss": 0.8158, "num_input_tokens_seen": 32099368, "step": 55635 }, { "epoch": 8.287161155793864, "grad_norm": 0.2014167159795761, "learning_rate": 4.33765228679574e-06, "loss": 0.802, "num_input_tokens_seen": 32102152, "step": 55640 }, { "epoch": 8.287905868334823, "grad_norm": 0.1400943249464035, "learning_rate": 4.333994489546661e-06, "loss": 0.7945, "num_input_tokens_seen": 32105192, "step": 55645 }, { "epoch": 8.288650580875782, "grad_norm": 0.25189805030822754, "learning_rate": 4.330338088822214e-06, "loss": 0.7757, "num_input_tokens_seen": 32108264, "step": 55650 }, { "epoch": 8.28939529341674, "grad_norm": 0.22229580581188202, "learning_rate": 4.3266830848694815e-06, "loss": 0.7942, "num_input_tokens_seen": 32111112, "step": 55655 }, { "epoch": 8.290140005957701, "grad_norm": 0.17890511453151703, "learning_rate": 4.3230294779354615e-06, "loss": 0.7603, "num_input_tokens_seen": 32114472, "step": 55660 }, { "epoch": 8.29088471849866, "grad_norm": 0.28350114822387695, "learning_rate": 4.319377268267035e-06, "loss": 0.8131, "num_input_tokens_seen": 32117544, "step": 55665 }, { "epoch": 8.291629431039619, "grad_norm": 0.17237839102745056, "learning_rate": 4.315726456111022e-06, "loss": 0.7847, "num_input_tokens_seen": 32120360, "step": 55670 }, { "epoch": 8.292374143580577, "grad_norm": 0.2532031834125519, "learning_rate": 4.312077041714108e-06, "loss": 0.8195, "num_input_tokens_seen": 32123304, "step": 55675 }, { "epoch": 8.293118856121538, "grad_norm": 0.24929843842983246, "learning_rate": 4.3084290253229185e-06, "loss": 0.7929, "num_input_tokens_seen": 32126568, "step": 55680 }, { "epoch": 8.293863568662497, "grad_norm": 0.16257719695568085, "learning_rate": 4.304782407183971e-06, "loss": 0.7912, "num_input_tokens_seen": 32129416, "step": 55685 }, { "epoch": 8.294608281203455, "grad_norm": 0.21416379511356354, "learning_rate": 4.3011371875436856e-06, "loss": 0.8299, "num_input_tokens_seen": 32132232, "step": 55690 }, { "epoch": 8.295352993744414, "grad_norm": 0.2956811487674713, "learning_rate": 4.2974933666484e-06, "loss": 0.7678, "num_input_tokens_seen": 32135400, "step": 55695 }, { "epoch": 8.296097706285375, "grad_norm": 0.27608028054237366, "learning_rate": 4.293850944744337e-06, "loss": 0.7928, "num_input_tokens_seen": 32138088, "step": 55700 }, { "epoch": 8.296842418826333, "grad_norm": 0.30755528807640076, "learning_rate": 4.290209922077643e-06, "loss": 0.8203, "num_input_tokens_seen": 32141064, "step": 55705 }, { "epoch": 8.297587131367292, "grad_norm": 0.35586386919021606, "learning_rate": 4.286570298894365e-06, "loss": 0.8187, "num_input_tokens_seen": 32143848, "step": 55710 }, { "epoch": 8.298331843908251, "grad_norm": 0.20533987879753113, "learning_rate": 4.282932075440449e-06, "loss": 0.8078, "num_input_tokens_seen": 32146696, "step": 55715 }, { "epoch": 8.299076556449211, "grad_norm": 0.28586500883102417, "learning_rate": 4.279295251961754e-06, "loss": 0.806, "num_input_tokens_seen": 32149896, "step": 55720 }, { "epoch": 8.29982126899017, "grad_norm": 0.17902164161205292, "learning_rate": 4.27565982870404e-06, "loss": 0.8005, "num_input_tokens_seen": 32152872, "step": 55725 }, { "epoch": 8.300565981531129, "grad_norm": 0.21290501952171326, "learning_rate": 4.272025805912982e-06, "loss": 0.7969, "num_input_tokens_seen": 32155592, "step": 55730 }, { "epoch": 8.301310694072088, "grad_norm": 0.2116246223449707, "learning_rate": 4.26839318383414e-06, "loss": 0.8004, "num_input_tokens_seen": 32158312, "step": 55735 }, { "epoch": 8.302055406613047, "grad_norm": 0.32648783922195435, "learning_rate": 4.2647619627129986e-06, "loss": 0.8005, "num_input_tokens_seen": 32161096, "step": 55740 }, { "epoch": 8.302800119154007, "grad_norm": 0.1732299029827118, "learning_rate": 4.261132142794941e-06, "loss": 0.7899, "num_input_tokens_seen": 32163880, "step": 55745 }, { "epoch": 8.303544831694966, "grad_norm": 0.2088930904865265, "learning_rate": 4.25750372432526e-06, "loss": 0.8037, "num_input_tokens_seen": 32166696, "step": 55750 }, { "epoch": 8.304289544235925, "grad_norm": 0.20649902522563934, "learning_rate": 4.2538767075491394e-06, "loss": 0.7871, "num_input_tokens_seen": 32169512, "step": 55755 }, { "epoch": 8.305034256776883, "grad_norm": 0.20081688463687897, "learning_rate": 4.250251092711682e-06, "loss": 0.8143, "num_input_tokens_seen": 32172200, "step": 55760 }, { "epoch": 8.305778969317844, "grad_norm": 0.23417726159095764, "learning_rate": 4.2466268800579026e-06, "loss": 0.8233, "num_input_tokens_seen": 32174952, "step": 55765 }, { "epoch": 8.306523681858803, "grad_norm": 0.19235222041606903, "learning_rate": 4.243004069832693e-06, "loss": 0.7784, "num_input_tokens_seen": 32177768, "step": 55770 }, { "epoch": 8.307268394399761, "grad_norm": 0.3088020980358124, "learning_rate": 4.239382662280875e-06, "loss": 0.7988, "num_input_tokens_seen": 32180648, "step": 55775 }, { "epoch": 8.30801310694072, "grad_norm": 0.3315717875957489, "learning_rate": 4.235762657647172e-06, "loss": 0.8159, "num_input_tokens_seen": 32183400, "step": 55780 }, { "epoch": 8.30875781948168, "grad_norm": 0.21085846424102783, "learning_rate": 4.232144056176207e-06, "loss": 0.8113, "num_input_tokens_seen": 32186120, "step": 55785 }, { "epoch": 8.30950253202264, "grad_norm": 0.2640392482280731, "learning_rate": 4.2285268581125165e-06, "loss": 0.7921, "num_input_tokens_seen": 32188776, "step": 55790 }, { "epoch": 8.310247244563598, "grad_norm": 0.19986601173877716, "learning_rate": 4.224911063700526e-06, "loss": 0.7962, "num_input_tokens_seen": 32191752, "step": 55795 }, { "epoch": 8.310991957104557, "grad_norm": 0.3195001184940338, "learning_rate": 4.221296673184585e-06, "loss": 0.788, "num_input_tokens_seen": 32194664, "step": 55800 }, { "epoch": 8.311736669645517, "grad_norm": 0.2371886819601059, "learning_rate": 4.217683686808929e-06, "loss": 0.8007, "num_input_tokens_seen": 32197384, "step": 55805 }, { "epoch": 8.312481382186476, "grad_norm": 0.27016690373420715, "learning_rate": 4.214072104817715e-06, "loss": 0.7949, "num_input_tokens_seen": 32200360, "step": 55810 }, { "epoch": 8.313226094727435, "grad_norm": 0.22025030851364136, "learning_rate": 4.2104619274549975e-06, "loss": 0.8233, "num_input_tokens_seen": 32203304, "step": 55815 }, { "epoch": 8.313970807268394, "grad_norm": 0.32225531339645386, "learning_rate": 4.2068531549647405e-06, "loss": 0.8084, "num_input_tokens_seen": 32206152, "step": 55820 }, { "epoch": 8.314715519809354, "grad_norm": 0.2597466707229614, "learning_rate": 4.203245787590815e-06, "loss": 0.7722, "num_input_tokens_seen": 32208968, "step": 55825 }, { "epoch": 8.315460232350313, "grad_norm": 0.24441148340702057, "learning_rate": 4.199639825576979e-06, "loss": 0.7759, "num_input_tokens_seen": 32212232, "step": 55830 }, { "epoch": 8.316204944891272, "grad_norm": 0.2642740309238434, "learning_rate": 4.196035269166921e-06, "loss": 0.8144, "num_input_tokens_seen": 32215080, "step": 55835 }, { "epoch": 8.31694965743223, "grad_norm": 0.2264302670955658, "learning_rate": 4.192432118604209e-06, "loss": 0.7906, "num_input_tokens_seen": 32218120, "step": 55840 }, { "epoch": 8.317694369973191, "grad_norm": 0.2155626118183136, "learning_rate": 4.188830374132341e-06, "loss": 0.7718, "num_input_tokens_seen": 32220968, "step": 55845 }, { "epoch": 8.31843908251415, "grad_norm": 0.20575574040412903, "learning_rate": 4.1852300359946996e-06, "loss": 0.789, "num_input_tokens_seen": 32223720, "step": 55850 }, { "epoch": 8.319183795055109, "grad_norm": 0.1828708052635193, "learning_rate": 4.181631104434588e-06, "loss": 0.78, "num_input_tokens_seen": 32226696, "step": 55855 }, { "epoch": 8.319928507596067, "grad_norm": 0.2267059087753296, "learning_rate": 4.178033579695212e-06, "loss": 0.8129, "num_input_tokens_seen": 32229608, "step": 55860 }, { "epoch": 8.320673220137028, "grad_norm": 0.2365676313638687, "learning_rate": 4.174437462019665e-06, "loss": 0.8234, "num_input_tokens_seen": 32232808, "step": 55865 }, { "epoch": 8.321417932677987, "grad_norm": 0.2002774029970169, "learning_rate": 4.170842751650969e-06, "loss": 0.7891, "num_input_tokens_seen": 32235560, "step": 55870 }, { "epoch": 8.322162645218945, "grad_norm": 0.2732222080230713, "learning_rate": 4.167249448832028e-06, "loss": 0.8234, "num_input_tokens_seen": 32238504, "step": 55875 }, { "epoch": 8.322907357759904, "grad_norm": 0.1837315410375595, "learning_rate": 4.163657553805669e-06, "loss": 0.7906, "num_input_tokens_seen": 32241064, "step": 55880 }, { "epoch": 8.323652070300863, "grad_norm": 0.24278631806373596, "learning_rate": 4.160067066814619e-06, "loss": 0.8482, "num_input_tokens_seen": 32243816, "step": 55885 }, { "epoch": 8.324396782841823, "grad_norm": 0.22849375009536743, "learning_rate": 4.156477988101507e-06, "loss": 0.8218, "num_input_tokens_seen": 32246696, "step": 55890 }, { "epoch": 8.325141495382782, "grad_norm": 0.2728506624698639, "learning_rate": 4.152890317908875e-06, "loss": 0.8002, "num_input_tokens_seen": 32249544, "step": 55895 }, { "epoch": 8.325886207923741, "grad_norm": 0.19019541144371033, "learning_rate": 4.149304056479153e-06, "loss": 0.8078, "num_input_tokens_seen": 32252360, "step": 55900 }, { "epoch": 8.3266309204647, "grad_norm": 0.20879943668842316, "learning_rate": 4.145719204054688e-06, "loss": 0.7756, "num_input_tokens_seen": 32254888, "step": 55905 }, { "epoch": 8.32737563300566, "grad_norm": 0.18313156068325043, "learning_rate": 4.1421357608777386e-06, "loss": 0.8256, "num_input_tokens_seen": 32257768, "step": 55910 }, { "epoch": 8.328120345546619, "grad_norm": 0.2703602910041809, "learning_rate": 4.138553727190447e-06, "loss": 0.779, "num_input_tokens_seen": 32260840, "step": 55915 }, { "epoch": 8.328865058087578, "grad_norm": 0.35481834411621094, "learning_rate": 4.134973103234877e-06, "loss": 0.8104, "num_input_tokens_seen": 32263976, "step": 55920 }, { "epoch": 8.329609770628537, "grad_norm": 0.29052358865737915, "learning_rate": 4.131393889252996e-06, "loss": 0.7724, "num_input_tokens_seen": 32267048, "step": 55925 }, { "epoch": 8.330354483169497, "grad_norm": 0.27019229531288147, "learning_rate": 4.127816085486674e-06, "loss": 0.8223, "num_input_tokens_seen": 32269832, "step": 55930 }, { "epoch": 8.331099195710456, "grad_norm": 0.1625482589006424, "learning_rate": 4.124239692177675e-06, "loss": 0.8223, "num_input_tokens_seen": 32272488, "step": 55935 }, { "epoch": 8.331843908251415, "grad_norm": 0.2931975722312927, "learning_rate": 4.120664709567684e-06, "loss": 0.8191, "num_input_tokens_seen": 32275400, "step": 55940 }, { "epoch": 8.332588620792373, "grad_norm": 0.26155024766921997, "learning_rate": 4.117091137898282e-06, "loss": 0.7516, "num_input_tokens_seen": 32278536, "step": 55945 }, { "epoch": 8.333333333333334, "grad_norm": 0.18401744961738586, "learning_rate": 4.113518977410963e-06, "loss": 0.7799, "num_input_tokens_seen": 32281544, "step": 55950 }, { "epoch": 8.334078045874293, "grad_norm": 0.23054476082324982, "learning_rate": 4.109948228347108e-06, "loss": 0.8051, "num_input_tokens_seen": 32284488, "step": 55955 }, { "epoch": 8.334822758415251, "grad_norm": 0.22891265153884888, "learning_rate": 4.1063788909480175e-06, "loss": 0.8222, "num_input_tokens_seen": 32287080, "step": 55960 }, { "epoch": 8.33556747095621, "grad_norm": 0.4097231328487396, "learning_rate": 4.102810965454904e-06, "loss": 0.8262, "num_input_tokens_seen": 32289960, "step": 55965 }, { "epoch": 8.33631218349717, "grad_norm": 0.21433141827583313, "learning_rate": 4.099244452108855e-06, "loss": 0.8083, "num_input_tokens_seen": 32292776, "step": 55970 }, { "epoch": 8.33705689603813, "grad_norm": 0.21340312063694, "learning_rate": 4.0956793511508885e-06, "loss": 0.8161, "num_input_tokens_seen": 32295784, "step": 55975 }, { "epoch": 8.337801608579088, "grad_norm": 0.2665598690509796, "learning_rate": 4.092115662821921e-06, "loss": 0.7907, "num_input_tokens_seen": 32298440, "step": 55980 }, { "epoch": 8.338546321120047, "grad_norm": 0.19165517389774323, "learning_rate": 4.088553387362773e-06, "loss": 0.8004, "num_input_tokens_seen": 32301128, "step": 55985 }, { "epoch": 8.339291033661008, "grad_norm": 0.3266040086746216, "learning_rate": 4.084992525014172e-06, "loss": 0.7694, "num_input_tokens_seen": 32304136, "step": 55990 }, { "epoch": 8.340035746201966, "grad_norm": 0.21564416587352753, "learning_rate": 4.081433076016739e-06, "loss": 0.7902, "num_input_tokens_seen": 32307048, "step": 55995 }, { "epoch": 8.340780458742925, "grad_norm": 0.2307956963777542, "learning_rate": 4.077875040611015e-06, "loss": 0.8154, "num_input_tokens_seen": 32310024, "step": 56000 }, { "epoch": 8.341525171283884, "grad_norm": 0.25410786271095276, "learning_rate": 4.074318419037424e-06, "loss": 0.8096, "num_input_tokens_seen": 32312840, "step": 56005 }, { "epoch": 8.342269883824844, "grad_norm": 0.2096397578716278, "learning_rate": 4.070763211536319e-06, "loss": 0.8291, "num_input_tokens_seen": 32315816, "step": 56010 }, { "epoch": 8.343014596365803, "grad_norm": 0.2026584893465042, "learning_rate": 4.067209418347942e-06, "loss": 0.791, "num_input_tokens_seen": 32318472, "step": 56015 }, { "epoch": 8.343759308906762, "grad_norm": 0.20810174942016602, "learning_rate": 4.063657039712448e-06, "loss": 0.8114, "num_input_tokens_seen": 32321544, "step": 56020 }, { "epoch": 8.34450402144772, "grad_norm": 0.19656722247600555, "learning_rate": 4.0601060758698965e-06, "loss": 0.8127, "num_input_tokens_seen": 32324424, "step": 56025 }, { "epoch": 8.345248733988681, "grad_norm": 0.20048470795154572, "learning_rate": 4.05655652706024e-06, "loss": 0.8326, "num_input_tokens_seen": 32327304, "step": 56030 }, { "epoch": 8.34599344652964, "grad_norm": 0.1713389754295349, "learning_rate": 4.053008393523336e-06, "loss": 0.7986, "num_input_tokens_seen": 32330024, "step": 56035 }, { "epoch": 8.346738159070599, "grad_norm": 0.19124650955200195, "learning_rate": 4.049461675498961e-06, "loss": 0.8096, "num_input_tokens_seen": 32332680, "step": 56040 }, { "epoch": 8.347482871611557, "grad_norm": 0.28729119896888733, "learning_rate": 4.045916373226791e-06, "loss": 0.7881, "num_input_tokens_seen": 32335784, "step": 56045 }, { "epoch": 8.348227584152518, "grad_norm": 0.21357452869415283, "learning_rate": 4.042372486946394e-06, "loss": 0.8193, "num_input_tokens_seen": 32338888, "step": 56050 }, { "epoch": 8.348972296693477, "grad_norm": 0.26513710618019104, "learning_rate": 4.03883001689726e-06, "loss": 0.8075, "num_input_tokens_seen": 32341928, "step": 56055 }, { "epoch": 8.349717009234435, "grad_norm": 0.14677605032920837, "learning_rate": 4.035288963318778e-06, "loss": 0.7984, "num_input_tokens_seen": 32344552, "step": 56060 }, { "epoch": 8.350461721775394, "grad_norm": 0.21240128576755524, "learning_rate": 4.031749326450224e-06, "loss": 0.817, "num_input_tokens_seen": 32347208, "step": 56065 }, { "epoch": 8.351206434316353, "grad_norm": 0.25483083724975586, "learning_rate": 4.028211106530808e-06, "loss": 0.7824, "num_input_tokens_seen": 32350376, "step": 56070 }, { "epoch": 8.351951146857314, "grad_norm": 0.18161636590957642, "learning_rate": 4.024674303799611e-06, "loss": 0.8005, "num_input_tokens_seen": 32353000, "step": 56075 }, { "epoch": 8.352695859398272, "grad_norm": 0.21421878039836884, "learning_rate": 4.021138918495648e-06, "loss": 0.7905, "num_input_tokens_seen": 32355816, "step": 56080 }, { "epoch": 8.353440571939231, "grad_norm": 0.1645270586013794, "learning_rate": 4.017604950857823e-06, "loss": 0.7784, "num_input_tokens_seen": 32358440, "step": 56085 }, { "epoch": 8.35418528448019, "grad_norm": 0.23340314626693726, "learning_rate": 4.014072401124946e-06, "loss": 0.8137, "num_input_tokens_seen": 32361128, "step": 56090 }, { "epoch": 8.35492999702115, "grad_norm": 0.16096313297748566, "learning_rate": 4.0105412695357395e-06, "loss": 0.7533, "num_input_tokens_seen": 32363944, "step": 56095 }, { "epoch": 8.35567470956211, "grad_norm": 0.21506167948246002, "learning_rate": 4.0070115563288105e-06, "loss": 0.7954, "num_input_tokens_seen": 32366568, "step": 56100 }, { "epoch": 8.356419422103068, "grad_norm": 0.22938328981399536, "learning_rate": 4.003483261742691e-06, "loss": 0.8196, "num_input_tokens_seen": 32369352, "step": 56105 }, { "epoch": 8.357164134644027, "grad_norm": 0.23765891790390015, "learning_rate": 3.999956386015813e-06, "loss": 0.8088, "num_input_tokens_seen": 32372296, "step": 56110 }, { "epoch": 8.357908847184987, "grad_norm": 0.21022887527942657, "learning_rate": 3.996430929386494e-06, "loss": 0.8004, "num_input_tokens_seen": 32375176, "step": 56115 }, { "epoch": 8.358653559725946, "grad_norm": 0.2602618932723999, "learning_rate": 3.992906892092979e-06, "loss": 0.8008, "num_input_tokens_seen": 32377928, "step": 56120 }, { "epoch": 8.359398272266905, "grad_norm": 0.2606925666332245, "learning_rate": 3.989384274373409e-06, "loss": 0.811, "num_input_tokens_seen": 32380776, "step": 56125 }, { "epoch": 8.360142984807863, "grad_norm": 0.19522222876548767, "learning_rate": 3.985863076465835e-06, "loss": 0.7775, "num_input_tokens_seen": 32383592, "step": 56130 }, { "epoch": 8.360887697348824, "grad_norm": 0.2582474946975708, "learning_rate": 3.9823432986081876e-06, "loss": 0.7977, "num_input_tokens_seen": 32386856, "step": 56135 }, { "epoch": 8.361632409889783, "grad_norm": 0.30107787251472473, "learning_rate": 3.978824941038328e-06, "loss": 0.796, "num_input_tokens_seen": 32389608, "step": 56140 }, { "epoch": 8.362377122430741, "grad_norm": 0.2142607420682907, "learning_rate": 3.975308003994016e-06, "loss": 0.8, "num_input_tokens_seen": 32392488, "step": 56145 }, { "epoch": 8.3631218349717, "grad_norm": 0.20924875140190125, "learning_rate": 3.971792487712914e-06, "loss": 0.7938, "num_input_tokens_seen": 32395784, "step": 56150 }, { "epoch": 8.36386654751266, "grad_norm": 0.20567180216312408, "learning_rate": 3.968278392432573e-06, "loss": 0.8217, "num_input_tokens_seen": 32398536, "step": 56155 }, { "epoch": 8.36461126005362, "grad_norm": 0.19721999764442444, "learning_rate": 3.964765718390473e-06, "loss": 0.8004, "num_input_tokens_seen": 32401224, "step": 56160 }, { "epoch": 8.365355972594578, "grad_norm": 0.20015603303909302, "learning_rate": 3.961254465823985e-06, "loss": 0.7926, "num_input_tokens_seen": 32404104, "step": 56165 }, { "epoch": 8.366100685135537, "grad_norm": 0.24487295746803284, "learning_rate": 3.957744634970378e-06, "loss": 0.8051, "num_input_tokens_seen": 32407016, "step": 56170 }, { "epoch": 8.366845397676498, "grad_norm": 0.25303423404693604, "learning_rate": 3.954236226066838e-06, "loss": 0.8031, "num_input_tokens_seen": 32410024, "step": 56175 }, { "epoch": 8.367590110217456, "grad_norm": 0.16812148690223694, "learning_rate": 3.950729239350448e-06, "loss": 0.8332, "num_input_tokens_seen": 32412872, "step": 56180 }, { "epoch": 8.368334822758415, "grad_norm": 0.14469406008720398, "learning_rate": 3.947223675058195e-06, "loss": 0.8293, "num_input_tokens_seen": 32415464, "step": 56185 }, { "epoch": 8.369079535299374, "grad_norm": 0.2198633849620819, "learning_rate": 3.943719533426979e-06, "loss": 0.7923, "num_input_tokens_seen": 32418600, "step": 56190 }, { "epoch": 8.369824247840334, "grad_norm": 0.24128256738185883, "learning_rate": 3.940216814693587e-06, "loss": 0.794, "num_input_tokens_seen": 32421320, "step": 56195 }, { "epoch": 8.370568960381293, "grad_norm": 0.1838095486164093, "learning_rate": 3.936715519094716e-06, "loss": 0.7932, "num_input_tokens_seen": 32424136, "step": 56200 }, { "epoch": 8.371313672922252, "grad_norm": 0.174804225564003, "learning_rate": 3.933215646866972e-06, "loss": 0.7868, "num_input_tokens_seen": 32426952, "step": 56205 }, { "epoch": 8.37205838546321, "grad_norm": 0.20072117447853088, "learning_rate": 3.929717198246862e-06, "loss": 0.8039, "num_input_tokens_seen": 32429736, "step": 56210 }, { "epoch": 8.372803098004171, "grad_norm": 0.1834830492734909, "learning_rate": 3.926220173470799e-06, "loss": 0.795, "num_input_tokens_seen": 32432904, "step": 56215 }, { "epoch": 8.37354781054513, "grad_norm": 0.24490542709827423, "learning_rate": 3.9227245727750965e-06, "loss": 0.7789, "num_input_tokens_seen": 32435528, "step": 56220 }, { "epoch": 8.374292523086089, "grad_norm": 0.29007065296173096, "learning_rate": 3.919230396395981e-06, "loss": 0.7791, "num_input_tokens_seen": 32438504, "step": 56225 }, { "epoch": 8.375037235627047, "grad_norm": 0.22842799127101898, "learning_rate": 3.915737644569567e-06, "loss": 0.8193, "num_input_tokens_seen": 32441480, "step": 56230 }, { "epoch": 8.375781948168008, "grad_norm": 0.2284766584634781, "learning_rate": 3.912246317531873e-06, "loss": 0.7904, "num_input_tokens_seen": 32444360, "step": 56235 }, { "epoch": 8.376526660708967, "grad_norm": 0.2154604196548462, "learning_rate": 3.908756415518835e-06, "loss": 0.7936, "num_input_tokens_seen": 32447528, "step": 56240 }, { "epoch": 8.377271373249926, "grad_norm": 0.14664706587791443, "learning_rate": 3.905267938766291e-06, "loss": 0.7711, "num_input_tokens_seen": 32450280, "step": 56245 }, { "epoch": 8.378016085790884, "grad_norm": 0.3144574761390686, "learning_rate": 3.901780887509973e-06, "loss": 0.7622, "num_input_tokens_seen": 32453192, "step": 56250 }, { "epoch": 8.378760798331843, "grad_norm": 0.20153768360614777, "learning_rate": 3.898295261985524e-06, "loss": 0.7851, "num_input_tokens_seen": 32455784, "step": 56255 }, { "epoch": 8.379505510872804, "grad_norm": 0.2132873237133026, "learning_rate": 3.894811062428494e-06, "loss": 0.7993, "num_input_tokens_seen": 32458888, "step": 56260 }, { "epoch": 8.380250223413762, "grad_norm": 0.17377714812755585, "learning_rate": 3.8913282890743195e-06, "loss": 0.8136, "num_input_tokens_seen": 32461608, "step": 56265 }, { "epoch": 8.380994935954721, "grad_norm": 0.25174564123153687, "learning_rate": 3.887846942158363e-06, "loss": 0.7723, "num_input_tokens_seen": 32464360, "step": 56270 }, { "epoch": 8.38173964849568, "grad_norm": 0.2752326428890228, "learning_rate": 3.884367021915869e-06, "loss": 0.7944, "num_input_tokens_seen": 32467304, "step": 56275 }, { "epoch": 8.38248436103664, "grad_norm": 0.23391179740428925, "learning_rate": 3.880888528581999e-06, "loss": 0.7858, "num_input_tokens_seen": 32469864, "step": 56280 }, { "epoch": 8.3832290735776, "grad_norm": 0.2116776704788208, "learning_rate": 3.877411462391822e-06, "loss": 0.8273, "num_input_tokens_seen": 32472648, "step": 56285 }, { "epoch": 8.383973786118558, "grad_norm": 0.45692113041877747, "learning_rate": 3.873935823580299e-06, "loss": 0.7945, "num_input_tokens_seen": 32475560, "step": 56290 }, { "epoch": 8.384718498659517, "grad_norm": 0.20228055119514465, "learning_rate": 3.870461612382306e-06, "loss": 0.8079, "num_input_tokens_seen": 32478440, "step": 56295 }, { "epoch": 8.385463211200477, "grad_norm": 0.21228401362895966, "learning_rate": 3.866988829032603e-06, "loss": 0.7827, "num_input_tokens_seen": 32481224, "step": 56300 }, { "epoch": 8.386207923741436, "grad_norm": 0.24107207357883453, "learning_rate": 3.863517473765877e-06, "loss": 0.815, "num_input_tokens_seen": 32484232, "step": 56305 }, { "epoch": 8.386952636282395, "grad_norm": 0.26856109499931335, "learning_rate": 3.8600475468167056e-06, "loss": 0.7906, "num_input_tokens_seen": 32487176, "step": 56310 }, { "epoch": 8.387697348823353, "grad_norm": 0.19775407016277313, "learning_rate": 3.8565790484195785e-06, "loss": 0.8253, "num_input_tokens_seen": 32490664, "step": 56315 }, { "epoch": 8.388442061364314, "grad_norm": 0.20705321431159973, "learning_rate": 3.853111978808868e-06, "loss": 0.8085, "num_input_tokens_seen": 32493640, "step": 56320 }, { "epoch": 8.389186773905273, "grad_norm": 0.28930938243865967, "learning_rate": 3.849646338218874e-06, "loss": 0.7885, "num_input_tokens_seen": 32496552, "step": 56325 }, { "epoch": 8.389931486446232, "grad_norm": 0.2716778516769409, "learning_rate": 3.846182126883796e-06, "loss": 0.8055, "num_input_tokens_seen": 32499400, "step": 56330 }, { "epoch": 8.39067619898719, "grad_norm": 0.22230395674705505, "learning_rate": 3.842719345037718e-06, "loss": 0.8125, "num_input_tokens_seen": 32502184, "step": 56335 }, { "epoch": 8.39142091152815, "grad_norm": 0.1678769290447235, "learning_rate": 3.83925799291465e-06, "loss": 0.7969, "num_input_tokens_seen": 32504968, "step": 56340 }, { "epoch": 8.39216562406911, "grad_norm": 0.23524701595306396, "learning_rate": 3.835798070748489e-06, "loss": 0.8169, "num_input_tokens_seen": 32507688, "step": 56345 }, { "epoch": 8.392910336610068, "grad_norm": 0.21370017528533936, "learning_rate": 3.8323395787730505e-06, "loss": 0.7883, "num_input_tokens_seen": 32510472, "step": 56350 }, { "epoch": 8.393655049151027, "grad_norm": 0.14986306428909302, "learning_rate": 3.828882517222046e-06, "loss": 0.8098, "num_input_tokens_seen": 32513192, "step": 56355 }, { "epoch": 8.394399761691988, "grad_norm": 0.2735188901424408, "learning_rate": 3.825426886329087e-06, "loss": 0.8062, "num_input_tokens_seen": 32515720, "step": 56360 }, { "epoch": 8.395144474232946, "grad_norm": 0.21755991876125336, "learning_rate": 3.8219726863276826e-06, "loss": 0.8038, "num_input_tokens_seen": 32518664, "step": 56365 }, { "epoch": 8.395889186773905, "grad_norm": 0.22755442559719086, "learning_rate": 3.81851991745126e-06, "loss": 0.7977, "num_input_tokens_seen": 32521800, "step": 56370 }, { "epoch": 8.396633899314864, "grad_norm": 0.32225126028060913, "learning_rate": 3.8150685799331454e-06, "loss": 0.8018, "num_input_tokens_seen": 32524552, "step": 56375 }, { "epoch": 8.397378611855824, "grad_norm": 0.1799653023481369, "learning_rate": 3.811618674006562e-06, "loss": 0.8284, "num_input_tokens_seen": 32527464, "step": 56380 }, { "epoch": 8.398123324396783, "grad_norm": 0.27436158061027527, "learning_rate": 3.8081701999046454e-06, "loss": 0.8223, "num_input_tokens_seen": 32530472, "step": 56385 }, { "epoch": 8.398868036937742, "grad_norm": 0.19629521667957306, "learning_rate": 3.804723157860432e-06, "loss": 0.7913, "num_input_tokens_seen": 32533160, "step": 56390 }, { "epoch": 8.3996127494787, "grad_norm": 0.2377387136220932, "learning_rate": 3.8012775481068517e-06, "loss": 0.8319, "num_input_tokens_seen": 32535976, "step": 56395 }, { "epoch": 8.400357462019661, "grad_norm": 0.24192459881305695, "learning_rate": 3.797833370876744e-06, "loss": 0.7955, "num_input_tokens_seen": 32538952, "step": 56400 }, { "epoch": 8.40110217456062, "grad_norm": 0.36841440200805664, "learning_rate": 3.794390626402855e-06, "loss": 0.7997, "num_input_tokens_seen": 32542056, "step": 56405 }, { "epoch": 8.401846887101579, "grad_norm": 0.25167331099510193, "learning_rate": 3.79094931491783e-06, "loss": 0.7897, "num_input_tokens_seen": 32545320, "step": 56410 }, { "epoch": 8.402591599642538, "grad_norm": 0.27768486738204956, "learning_rate": 3.7875094366542212e-06, "loss": 0.7743, "num_input_tokens_seen": 32548584, "step": 56415 }, { "epoch": 8.403336312183498, "grad_norm": 0.22877325117588043, "learning_rate": 3.7840709918444823e-06, "loss": 0.8348, "num_input_tokens_seen": 32551496, "step": 56420 }, { "epoch": 8.404081024724457, "grad_norm": 0.18361125886440277, "learning_rate": 3.780633980720974e-06, "loss": 0.8094, "num_input_tokens_seen": 32554056, "step": 56425 }, { "epoch": 8.404825737265416, "grad_norm": 0.24630142748355865, "learning_rate": 3.777198403515944e-06, "loss": 0.7867, "num_input_tokens_seen": 32557160, "step": 56430 }, { "epoch": 8.405570449806374, "grad_norm": 0.19266541302204132, "learning_rate": 3.7737642604615624e-06, "loss": 0.7872, "num_input_tokens_seen": 32560040, "step": 56435 }, { "epoch": 8.406315162347333, "grad_norm": 0.24751010537147522, "learning_rate": 3.7703315517898908e-06, "loss": 0.8116, "num_input_tokens_seen": 32562824, "step": 56440 }, { "epoch": 8.407059874888294, "grad_norm": 0.33830931782722473, "learning_rate": 3.7669002777328986e-06, "loss": 0.8055, "num_input_tokens_seen": 32565608, "step": 56445 }, { "epoch": 8.407804587429252, "grad_norm": 0.3125493824481964, "learning_rate": 3.763470438522457e-06, "loss": 0.7943, "num_input_tokens_seen": 32568680, "step": 56450 }, { "epoch": 8.408549299970211, "grad_norm": 0.2799665033817291, "learning_rate": 3.760042034390343e-06, "loss": 0.7809, "num_input_tokens_seen": 32571720, "step": 56455 }, { "epoch": 8.40929401251117, "grad_norm": 0.2397499829530716, "learning_rate": 3.7566150655682364e-06, "loss": 0.796, "num_input_tokens_seen": 32574568, "step": 56460 }, { "epoch": 8.41003872505213, "grad_norm": 0.20760001242160797, "learning_rate": 3.7531895322877096e-06, "loss": 0.8026, "num_input_tokens_seen": 32577352, "step": 56465 }, { "epoch": 8.41078343759309, "grad_norm": 0.23719482123851776, "learning_rate": 3.749765434780253e-06, "loss": 0.7765, "num_input_tokens_seen": 32580488, "step": 56470 }, { "epoch": 8.411528150134048, "grad_norm": 0.15831348299980164, "learning_rate": 3.746342773277256e-06, "loss": 0.7991, "num_input_tokens_seen": 32583496, "step": 56475 }, { "epoch": 8.412272862675007, "grad_norm": 0.2523375153541565, "learning_rate": 3.742921548009995e-06, "loss": 0.8011, "num_input_tokens_seen": 32586184, "step": 56480 }, { "epoch": 8.413017575215967, "grad_norm": 0.2817267179489136, "learning_rate": 3.7395017592096738e-06, "loss": 0.7759, "num_input_tokens_seen": 32589160, "step": 56485 }, { "epoch": 8.413762287756926, "grad_norm": 0.21213601529598236, "learning_rate": 3.7360834071073823e-06, "loss": 0.7963, "num_input_tokens_seen": 32592008, "step": 56490 }, { "epoch": 8.414507000297885, "grad_norm": 0.27669718861579895, "learning_rate": 3.7326664919341308e-06, "loss": 0.8057, "num_input_tokens_seen": 32594984, "step": 56495 }, { "epoch": 8.415251712838844, "grad_norm": 0.2294967919588089, "learning_rate": 3.7292510139208007e-06, "loss": 0.7998, "num_input_tokens_seen": 32597640, "step": 56500 }, { "epoch": 8.415996425379804, "grad_norm": 0.17822642624378204, "learning_rate": 3.725836973298211e-06, "loss": 0.8024, "num_input_tokens_seen": 32600552, "step": 56505 }, { "epoch": 8.416741137920763, "grad_norm": 0.23920938372612, "learning_rate": 3.722424370297062e-06, "loss": 0.787, "num_input_tokens_seen": 32603368, "step": 56510 }, { "epoch": 8.417485850461722, "grad_norm": 0.3350479304790497, "learning_rate": 3.7190132051479697e-06, "loss": 0.8177, "num_input_tokens_seen": 32606120, "step": 56515 }, { "epoch": 8.41823056300268, "grad_norm": 0.28497394919395447, "learning_rate": 3.715603478081439e-06, "loss": 0.8044, "num_input_tokens_seen": 32609192, "step": 56520 }, { "epoch": 8.418975275543641, "grad_norm": 0.24912820756435394, "learning_rate": 3.7121951893278966e-06, "loss": 0.7926, "num_input_tokens_seen": 32612168, "step": 56525 }, { "epoch": 8.4197199880846, "grad_norm": 0.28153905272483826, "learning_rate": 3.708788339117644e-06, "loss": 0.7951, "num_input_tokens_seen": 32615048, "step": 56530 }, { "epoch": 8.420464700625558, "grad_norm": 0.21731673181056976, "learning_rate": 3.7053829276809143e-06, "loss": 0.821, "num_input_tokens_seen": 32617928, "step": 56535 }, { "epoch": 8.421209413166517, "grad_norm": 0.282992959022522, "learning_rate": 3.7019789552478286e-06, "loss": 0.8141, "num_input_tokens_seen": 32620840, "step": 56540 }, { "epoch": 8.421954125707478, "grad_norm": 0.280393123626709, "learning_rate": 3.6985764220484137e-06, "loss": 0.8274, "num_input_tokens_seen": 32623656, "step": 56545 }, { "epoch": 8.422698838248436, "grad_norm": 0.20658527314662933, "learning_rate": 3.695175328312597e-06, "loss": 0.8026, "num_input_tokens_seen": 32626472, "step": 56550 }, { "epoch": 8.423443550789395, "grad_norm": 0.22499334812164307, "learning_rate": 3.6917756742702205e-06, "loss": 0.7871, "num_input_tokens_seen": 32629736, "step": 56555 }, { "epoch": 8.424188263330354, "grad_norm": 0.17751601338386536, "learning_rate": 3.68837746015101e-06, "loss": 0.7778, "num_input_tokens_seen": 32632680, "step": 56560 }, { "epoch": 8.424932975871315, "grad_norm": 0.2266421914100647, "learning_rate": 3.6849806861845997e-06, "loss": 0.8075, "num_input_tokens_seen": 32635464, "step": 56565 }, { "epoch": 8.425677688412273, "grad_norm": 0.3168160915374756, "learning_rate": 3.6815853526005305e-06, "loss": 0.7653, "num_input_tokens_seen": 32638376, "step": 56570 }, { "epoch": 8.426422400953232, "grad_norm": 0.18388321995735168, "learning_rate": 3.678191459628252e-06, "loss": 0.8169, "num_input_tokens_seen": 32641000, "step": 56575 }, { "epoch": 8.42716711349419, "grad_norm": 0.2307003140449524, "learning_rate": 3.6747990074971065e-06, "loss": 0.7956, "num_input_tokens_seen": 32643848, "step": 56580 }, { "epoch": 8.42791182603515, "grad_norm": 0.23631322383880615, "learning_rate": 3.671407996436341e-06, "loss": 0.7974, "num_input_tokens_seen": 32646952, "step": 56585 }, { "epoch": 8.42865653857611, "grad_norm": 0.1986595094203949, "learning_rate": 3.6680184266751128e-06, "loss": 0.8194, "num_input_tokens_seen": 32650344, "step": 56590 }, { "epoch": 8.429401251117069, "grad_norm": 0.2875922918319702, "learning_rate": 3.66463029844247e-06, "loss": 0.8244, "num_input_tokens_seen": 32653128, "step": 56595 }, { "epoch": 8.430145963658028, "grad_norm": 0.1742861270904541, "learning_rate": 3.6612436119673634e-06, "loss": 0.8212, "num_input_tokens_seen": 32655720, "step": 56600 }, { "epoch": 8.430890676198986, "grad_norm": 0.20643720030784607, "learning_rate": 3.657858367478656e-06, "loss": 0.7748, "num_input_tokens_seen": 32658696, "step": 56605 }, { "epoch": 8.431635388739947, "grad_norm": 0.24198751151561737, "learning_rate": 3.6544745652051097e-06, "loss": 0.7903, "num_input_tokens_seen": 32661512, "step": 56610 }, { "epoch": 8.432380101280906, "grad_norm": 0.1470886766910553, "learning_rate": 3.6510922053753864e-06, "loss": 0.7727, "num_input_tokens_seen": 32664456, "step": 56615 }, { "epoch": 8.433124813821864, "grad_norm": 0.30595681071281433, "learning_rate": 3.647711288218053e-06, "loss": 0.816, "num_input_tokens_seen": 32667368, "step": 56620 }, { "epoch": 8.433869526362823, "grad_norm": 0.30299919843673706, "learning_rate": 3.644331813961588e-06, "loss": 0.7942, "num_input_tokens_seen": 32669992, "step": 56625 }, { "epoch": 8.434614238903784, "grad_norm": 0.24696040153503418, "learning_rate": 3.640953782834344e-06, "loss": 0.7833, "num_input_tokens_seen": 32672840, "step": 56630 }, { "epoch": 8.435358951444742, "grad_norm": 0.33073052763938904, "learning_rate": 3.637577195064612e-06, "loss": 0.797, "num_input_tokens_seen": 32675624, "step": 56635 }, { "epoch": 8.436103663985701, "grad_norm": 0.22443482279777527, "learning_rate": 3.634202050880553e-06, "loss": 0.8122, "num_input_tokens_seen": 32678440, "step": 56640 }, { "epoch": 8.43684837652666, "grad_norm": 0.21003358066082, "learning_rate": 3.6308283505102515e-06, "loss": 0.8268, "num_input_tokens_seen": 32681320, "step": 56645 }, { "epoch": 8.43759308906762, "grad_norm": 0.2199946790933609, "learning_rate": 3.6274560941816887e-06, "loss": 0.8004, "num_input_tokens_seen": 32684072, "step": 56650 }, { "epoch": 8.43833780160858, "grad_norm": 0.3017483651638031, "learning_rate": 3.6240852821227524e-06, "loss": 0.7907, "num_input_tokens_seen": 32687016, "step": 56655 }, { "epoch": 8.439082514149538, "grad_norm": 0.2578219473361969, "learning_rate": 3.620715914561226e-06, "loss": 0.815, "num_input_tokens_seen": 32689992, "step": 56660 }, { "epoch": 8.439827226690497, "grad_norm": 0.1895676702260971, "learning_rate": 3.6173479917247927e-06, "loss": 0.7747, "num_input_tokens_seen": 32692712, "step": 56665 }, { "epoch": 8.440571939231457, "grad_norm": 0.16259290277957916, "learning_rate": 3.613981513841047e-06, "loss": 0.7953, "num_input_tokens_seen": 32695528, "step": 56670 }, { "epoch": 8.441316651772416, "grad_norm": 0.2531627118587494, "learning_rate": 3.6106164811374855e-06, "loss": 0.8331, "num_input_tokens_seen": 32698536, "step": 56675 }, { "epoch": 8.442061364313375, "grad_norm": 0.20141565799713135, "learning_rate": 3.607252893841495e-06, "loss": 0.7923, "num_input_tokens_seen": 32701480, "step": 56680 }, { "epoch": 8.442806076854334, "grad_norm": 0.2617969810962677, "learning_rate": 3.6038907521803776e-06, "loss": 0.8203, "num_input_tokens_seen": 32704488, "step": 56685 }, { "epoch": 8.443550789395294, "grad_norm": 0.21364258229732513, "learning_rate": 3.6005300563813375e-06, "loss": 0.7776, "num_input_tokens_seen": 32707464, "step": 56690 }, { "epoch": 8.444295501936253, "grad_norm": 0.18400433659553528, "learning_rate": 3.5971708066714682e-06, "loss": 0.8109, "num_input_tokens_seen": 32710216, "step": 56695 }, { "epoch": 8.445040214477212, "grad_norm": 0.21478401124477386, "learning_rate": 3.593813003277777e-06, "loss": 0.7893, "num_input_tokens_seen": 32713288, "step": 56700 }, { "epoch": 8.44578492701817, "grad_norm": 0.2401418387889862, "learning_rate": 3.5904566464271704e-06, "loss": 0.7889, "num_input_tokens_seen": 32715816, "step": 56705 }, { "epoch": 8.446529639559131, "grad_norm": 0.31777381896972656, "learning_rate": 3.5871017363464596e-06, "loss": 0.7705, "num_input_tokens_seen": 32718664, "step": 56710 }, { "epoch": 8.44727435210009, "grad_norm": 0.2835918664932251, "learning_rate": 3.5837482732623636e-06, "loss": 0.8062, "num_input_tokens_seen": 32721704, "step": 56715 }, { "epoch": 8.448019064641048, "grad_norm": 0.1833871304988861, "learning_rate": 3.5803962574014775e-06, "loss": 0.8073, "num_input_tokens_seen": 32724584, "step": 56720 }, { "epoch": 8.448763777182007, "grad_norm": 0.3478207290172577, "learning_rate": 3.577045688990335e-06, "loss": 0.8198, "num_input_tokens_seen": 32727912, "step": 56725 }, { "epoch": 8.449508489722968, "grad_norm": 0.2523888647556305, "learning_rate": 3.5736965682553385e-06, "loss": 0.7938, "num_input_tokens_seen": 32730472, "step": 56730 }, { "epoch": 8.450253202263927, "grad_norm": 0.16771334409713745, "learning_rate": 3.5703488954228147e-06, "loss": 0.7871, "num_input_tokens_seen": 32733256, "step": 56735 }, { "epoch": 8.450997914804885, "grad_norm": 0.18881702423095703, "learning_rate": 3.5670026707189858e-06, "loss": 0.797, "num_input_tokens_seen": 32736296, "step": 56740 }, { "epoch": 8.451742627345844, "grad_norm": 0.1761874556541443, "learning_rate": 3.5636578943699787e-06, "loss": 0.7885, "num_input_tokens_seen": 32739144, "step": 56745 }, { "epoch": 8.452487339886805, "grad_norm": 0.24763627350330353, "learning_rate": 3.5603145666018132e-06, "loss": 0.8049, "num_input_tokens_seen": 32742248, "step": 56750 }, { "epoch": 8.453232052427763, "grad_norm": 0.29702743887901306, "learning_rate": 3.5569726876404307e-06, "loss": 0.8293, "num_input_tokens_seen": 32745192, "step": 56755 }, { "epoch": 8.453976764968722, "grad_norm": 0.14513258635997772, "learning_rate": 3.553632257711653e-06, "loss": 0.8056, "num_input_tokens_seen": 32748136, "step": 56760 }, { "epoch": 8.45472147750968, "grad_norm": 0.2786065638065338, "learning_rate": 3.550293277041206e-06, "loss": 0.7946, "num_input_tokens_seen": 32751016, "step": 56765 }, { "epoch": 8.45546619005064, "grad_norm": 0.1985386312007904, "learning_rate": 3.54695574585473e-06, "loss": 0.8071, "num_input_tokens_seen": 32754120, "step": 56770 }, { "epoch": 8.4562109025916, "grad_norm": 0.24556593596935272, "learning_rate": 3.543619664377765e-06, "loss": 0.7955, "num_input_tokens_seen": 32756968, "step": 56775 }, { "epoch": 8.456955615132559, "grad_norm": 0.15936116874217987, "learning_rate": 3.540285032835747e-06, "loss": 0.8175, "num_input_tokens_seen": 32759752, "step": 56780 }, { "epoch": 8.457700327673518, "grad_norm": 0.26126715540885925, "learning_rate": 3.536951851454018e-06, "loss": 0.817, "num_input_tokens_seen": 32762888, "step": 56785 }, { "epoch": 8.458445040214476, "grad_norm": 0.21546004712581635, "learning_rate": 3.5336201204578256e-06, "loss": 0.792, "num_input_tokens_seen": 32765544, "step": 56790 }, { "epoch": 8.459189752755437, "grad_norm": 0.20761658251285553, "learning_rate": 3.5302898400723094e-06, "loss": 0.8269, "num_input_tokens_seen": 32768360, "step": 56795 }, { "epoch": 8.459934465296396, "grad_norm": 0.26295873522758484, "learning_rate": 3.5269610105225114e-06, "loss": 0.7902, "num_input_tokens_seen": 32771144, "step": 56800 }, { "epoch": 8.460679177837354, "grad_norm": 0.2503697872161865, "learning_rate": 3.523633632033385e-06, "loss": 0.8007, "num_input_tokens_seen": 32773992, "step": 56805 }, { "epoch": 8.461423890378313, "grad_norm": 0.16323095560073853, "learning_rate": 3.520307704829781e-06, "loss": 0.8351, "num_input_tokens_seen": 32776680, "step": 56810 }, { "epoch": 8.462168602919274, "grad_norm": 0.26730749011039734, "learning_rate": 3.5169832291364502e-06, "loss": 0.8391, "num_input_tokens_seen": 32779656, "step": 56815 }, { "epoch": 8.462913315460233, "grad_norm": 0.1631263941526413, "learning_rate": 3.5136602051780517e-06, "loss": 0.8122, "num_input_tokens_seen": 32782248, "step": 56820 }, { "epoch": 8.463658028001191, "grad_norm": 0.19561952352523804, "learning_rate": 3.5103386331791444e-06, "loss": 0.8161, "num_input_tokens_seen": 32785288, "step": 56825 }, { "epoch": 8.46440274054215, "grad_norm": 0.2632555067539215, "learning_rate": 3.507018513364177e-06, "loss": 0.7758, "num_input_tokens_seen": 32787976, "step": 56830 }, { "epoch": 8.46514745308311, "grad_norm": 0.15858376026153564, "learning_rate": 3.5036998459575197e-06, "loss": 0.781, "num_input_tokens_seen": 32791080, "step": 56835 }, { "epoch": 8.46589216562407, "grad_norm": 0.2468152642250061, "learning_rate": 3.5003826311834214e-06, "loss": 0.8014, "num_input_tokens_seen": 32793608, "step": 56840 }, { "epoch": 8.466636878165028, "grad_norm": 0.20001378655433655, "learning_rate": 3.497066869266058e-06, "loss": 0.7919, "num_input_tokens_seen": 32796328, "step": 56845 }, { "epoch": 8.467381590705987, "grad_norm": 0.3718692362308502, "learning_rate": 3.493752560429486e-06, "loss": 0.809, "num_input_tokens_seen": 32799464, "step": 56850 }, { "epoch": 8.468126303246947, "grad_norm": 0.2493022084236145, "learning_rate": 3.490439704897688e-06, "loss": 0.7977, "num_input_tokens_seen": 32802088, "step": 56855 }, { "epoch": 8.468871015787906, "grad_norm": 0.23208290338516235, "learning_rate": 3.4871283028945155e-06, "loss": 0.7915, "num_input_tokens_seen": 32804776, "step": 56860 }, { "epoch": 8.469615728328865, "grad_norm": 0.19168424606323242, "learning_rate": 3.4838183546437475e-06, "loss": 0.7938, "num_input_tokens_seen": 32807464, "step": 56865 }, { "epoch": 8.470360440869824, "grad_norm": 0.2791783809661865, "learning_rate": 3.480509860369058e-06, "loss": 0.7933, "num_input_tokens_seen": 32810600, "step": 56870 }, { "epoch": 8.471105153410784, "grad_norm": 0.21043811738491058, "learning_rate": 3.477202820294018e-06, "loss": 0.8072, "num_input_tokens_seen": 32813160, "step": 56875 }, { "epoch": 8.471849865951743, "grad_norm": 0.22496849298477173, "learning_rate": 3.473897234642112e-06, "loss": 0.8191, "num_input_tokens_seen": 32815816, "step": 56880 }, { "epoch": 8.472594578492702, "grad_norm": 0.226905956864357, "learning_rate": 3.4705931036367074e-06, "loss": 0.7795, "num_input_tokens_seen": 32818696, "step": 56885 }, { "epoch": 8.47333929103366, "grad_norm": 0.20552141964435577, "learning_rate": 3.4672904275010936e-06, "loss": 0.793, "num_input_tokens_seen": 32821608, "step": 56890 }, { "epoch": 8.474084003574621, "grad_norm": 0.2441989630460739, "learning_rate": 3.463989206458443e-06, "loss": 0.8227, "num_input_tokens_seen": 32824776, "step": 56895 }, { "epoch": 8.47482871611558, "grad_norm": 0.23030517995357513, "learning_rate": 3.460689440731843e-06, "loss": 0.8024, "num_input_tokens_seen": 32827656, "step": 56900 }, { "epoch": 8.475573428656539, "grad_norm": 0.2080305516719818, "learning_rate": 3.457391130544277e-06, "loss": 0.804, "num_input_tokens_seen": 32830440, "step": 56905 }, { "epoch": 8.476318141197497, "grad_norm": 0.2416863590478897, "learning_rate": 3.45409427611863e-06, "loss": 0.805, "num_input_tokens_seen": 32833384, "step": 56910 }, { "epoch": 8.477062853738458, "grad_norm": 0.17614708840847015, "learning_rate": 3.4507988776776968e-06, "loss": 0.8002, "num_input_tokens_seen": 32836328, "step": 56915 }, { "epoch": 8.477807566279417, "grad_norm": 0.18454709649085999, "learning_rate": 3.4475049354441653e-06, "loss": 0.7956, "num_input_tokens_seen": 32839176, "step": 56920 }, { "epoch": 8.478552278820375, "grad_norm": 0.26223719120025635, "learning_rate": 3.444212449640627e-06, "loss": 0.7829, "num_input_tokens_seen": 32842024, "step": 56925 }, { "epoch": 8.479296991361334, "grad_norm": 0.2421216070652008, "learning_rate": 3.4409214204895653e-06, "loss": 0.8222, "num_input_tokens_seen": 32844904, "step": 56930 }, { "epoch": 8.480041703902295, "grad_norm": 0.19867485761642456, "learning_rate": 3.4376318482133797e-06, "loss": 0.8284, "num_input_tokens_seen": 32847560, "step": 56935 }, { "epoch": 8.480786416443253, "grad_norm": 0.17723332345485687, "learning_rate": 3.4343437330343675e-06, "loss": 0.789, "num_input_tokens_seen": 32850408, "step": 56940 }, { "epoch": 8.481531128984212, "grad_norm": 0.23349837958812714, "learning_rate": 3.431057075174729e-06, "loss": 0.8352, "num_input_tokens_seen": 32853352, "step": 56945 }, { "epoch": 8.482275841525171, "grad_norm": 0.23653864860534668, "learning_rate": 3.4277718748565585e-06, "loss": 0.8113, "num_input_tokens_seen": 32856264, "step": 56950 }, { "epoch": 8.48302055406613, "grad_norm": 0.2728346884250641, "learning_rate": 3.4244881323018645e-06, "loss": 0.8284, "num_input_tokens_seen": 32859112, "step": 56955 }, { "epoch": 8.48376526660709, "grad_norm": 0.2814464271068573, "learning_rate": 3.421205847732542e-06, "loss": 0.8236, "num_input_tokens_seen": 32861992, "step": 56960 }, { "epoch": 8.484509979148049, "grad_norm": 0.21332629024982452, "learning_rate": 3.4179250213703914e-06, "loss": 0.7983, "num_input_tokens_seen": 32864840, "step": 56965 }, { "epoch": 8.485254691689008, "grad_norm": 0.2415199726819992, "learning_rate": 3.414645653437118e-06, "loss": 0.8237, "num_input_tokens_seen": 32867944, "step": 56970 }, { "epoch": 8.485999404229966, "grad_norm": 0.2145369052886963, "learning_rate": 3.411367744154334e-06, "loss": 0.7921, "num_input_tokens_seen": 32871112, "step": 56975 }, { "epoch": 8.486744116770927, "grad_norm": 0.20179876685142517, "learning_rate": 3.4080912937435455e-06, "loss": 0.7737, "num_input_tokens_seen": 32874088, "step": 56980 }, { "epoch": 8.487488829311886, "grad_norm": 0.1854400336742401, "learning_rate": 3.4048163024261614e-06, "loss": 0.8043, "num_input_tokens_seen": 32876808, "step": 56985 }, { "epoch": 8.488233541852845, "grad_norm": 0.38330143690109253, "learning_rate": 3.4015427704234965e-06, "loss": 0.8263, "num_input_tokens_seen": 32879752, "step": 56990 }, { "epoch": 8.488978254393803, "grad_norm": 0.2131040394306183, "learning_rate": 3.3982706979567542e-06, "loss": 0.8111, "num_input_tokens_seen": 32882600, "step": 56995 }, { "epoch": 8.489722966934764, "grad_norm": 0.5004626512527466, "learning_rate": 3.395000085247055e-06, "loss": 0.7884, "num_input_tokens_seen": 32885640, "step": 57000 }, { "epoch": 8.490467679475723, "grad_norm": 0.17915041744709015, "learning_rate": 3.391730932515405e-06, "loss": 0.7996, "num_input_tokens_seen": 32888328, "step": 57005 }, { "epoch": 8.491212392016681, "grad_norm": 0.2772871255874634, "learning_rate": 3.388463239982728e-06, "loss": 0.7958, "num_input_tokens_seen": 32891016, "step": 57010 }, { "epoch": 8.49195710455764, "grad_norm": 0.16818471252918243, "learning_rate": 3.3851970078698394e-06, "loss": 0.7865, "num_input_tokens_seen": 32893640, "step": 57015 }, { "epoch": 8.4927018170986, "grad_norm": 0.24529075622558594, "learning_rate": 3.3819322363974615e-06, "loss": 0.7985, "num_input_tokens_seen": 32896584, "step": 57020 }, { "epoch": 8.49344652963956, "grad_norm": 0.21268267929553986, "learning_rate": 3.3786689257862047e-06, "loss": 0.7926, "num_input_tokens_seen": 32899432, "step": 57025 }, { "epoch": 8.494191242180518, "grad_norm": 0.26234495639801025, "learning_rate": 3.3754070762565952e-06, "loss": 0.796, "num_input_tokens_seen": 32902440, "step": 57030 }, { "epoch": 8.494935954721477, "grad_norm": 0.18945759534835815, "learning_rate": 3.372146688029057e-06, "loss": 0.7617, "num_input_tokens_seen": 32906664, "step": 57035 }, { "epoch": 8.495680667262437, "grad_norm": 0.2815546989440918, "learning_rate": 3.368887761323919e-06, "loss": 0.8125, "num_input_tokens_seen": 32909576, "step": 57040 }, { "epoch": 8.496425379803396, "grad_norm": 0.14467038214206696, "learning_rate": 3.3656302963613966e-06, "loss": 0.7916, "num_input_tokens_seen": 32912072, "step": 57045 }, { "epoch": 8.497170092344355, "grad_norm": 0.18538017570972443, "learning_rate": 3.362374293361617e-06, "loss": 0.7869, "num_input_tokens_seen": 32914792, "step": 57050 }, { "epoch": 8.497914804885314, "grad_norm": 0.2092842161655426, "learning_rate": 3.359119752544618e-06, "loss": 0.7794, "num_input_tokens_seen": 32917416, "step": 57055 }, { "epoch": 8.498659517426274, "grad_norm": 0.20658178627490997, "learning_rate": 3.3558666741303147e-06, "loss": 0.8049, "num_input_tokens_seen": 32920200, "step": 57060 }, { "epoch": 8.499404229967233, "grad_norm": 0.21053750813007355, "learning_rate": 3.352615058338543e-06, "loss": 0.8, "num_input_tokens_seen": 32923240, "step": 57065 }, { "epoch": 8.5, "eval_loss": 0.8026010394096375, "eval_runtime": 45.2103, "eval_samples_per_second": 66.003, "eval_steps_per_second": 16.501, "num_input_tokens_seen": 32925544, "step": 57069 }, { "epoch": 8.500148942508192, "grad_norm": 0.2377014011144638, "learning_rate": 3.3493649053890326e-06, "loss": 0.8005, "num_input_tokens_seen": 32926120, "step": 57070 }, { "epoch": 8.50089365504915, "grad_norm": 0.31173476576805115, "learning_rate": 3.3461162155014186e-06, "loss": 0.8174, "num_input_tokens_seen": 32929192, "step": 57075 }, { "epoch": 8.501638367590111, "grad_norm": 0.21772804856300354, "learning_rate": 3.342868988895237e-06, "loss": 0.7945, "num_input_tokens_seen": 32932168, "step": 57080 }, { "epoch": 8.50238308013107, "grad_norm": 0.31100085377693176, "learning_rate": 3.3396232257899116e-06, "loss": 0.8124, "num_input_tokens_seen": 32935048, "step": 57085 }, { "epoch": 8.503127792672029, "grad_norm": 0.19511274993419647, "learning_rate": 3.33637892640479e-06, "loss": 0.779, "num_input_tokens_seen": 32937864, "step": 57090 }, { "epoch": 8.503872505212987, "grad_norm": 0.26899078488349915, "learning_rate": 3.3331360909590994e-06, "loss": 0.8119, "num_input_tokens_seen": 32940680, "step": 57095 }, { "epoch": 8.504617217753946, "grad_norm": 0.18974162638187408, "learning_rate": 3.3298947196719776e-06, "loss": 0.7751, "num_input_tokens_seen": 32943208, "step": 57100 }, { "epoch": 8.505361930294907, "grad_norm": 0.23090702295303345, "learning_rate": 3.326654812762467e-06, "loss": 0.8441, "num_input_tokens_seen": 32945896, "step": 57105 }, { "epoch": 8.506106642835865, "grad_norm": 0.20667871832847595, "learning_rate": 3.3234163704495086e-06, "loss": 0.8127, "num_input_tokens_seen": 32948776, "step": 57110 }, { "epoch": 8.506851355376824, "grad_norm": 0.2949349284172058, "learning_rate": 3.3201793929519386e-06, "loss": 0.8191, "num_input_tokens_seen": 32951496, "step": 57115 }, { "epoch": 8.507596067917785, "grad_norm": 0.2750449478626251, "learning_rate": 3.316943880488507e-06, "loss": 0.8238, "num_input_tokens_seen": 32954312, "step": 57120 }, { "epoch": 8.508340780458743, "grad_norm": 0.4932537376880646, "learning_rate": 3.313709833277853e-06, "loss": 0.818, "num_input_tokens_seen": 32957224, "step": 57125 }, { "epoch": 8.509085492999702, "grad_norm": 0.17910033464431763, "learning_rate": 3.310477251538513e-06, "loss": 0.7852, "num_input_tokens_seen": 32960360, "step": 57130 }, { "epoch": 8.509830205540661, "grad_norm": 0.2560344934463501, "learning_rate": 3.3072461354889367e-06, "loss": 0.8222, "num_input_tokens_seen": 32963208, "step": 57135 }, { "epoch": 8.51057491808162, "grad_norm": 0.2631717920303345, "learning_rate": 3.304016485347469e-06, "loss": 0.772, "num_input_tokens_seen": 32966088, "step": 57140 }, { "epoch": 8.51131963062258, "grad_norm": 0.19221989810466766, "learning_rate": 3.300788301332361e-06, "loss": 0.8196, "num_input_tokens_seen": 32969032, "step": 57145 }, { "epoch": 8.512064343163539, "grad_norm": 0.18906757235527039, "learning_rate": 3.297561583661754e-06, "loss": 0.8059, "num_input_tokens_seen": 32971752, "step": 57150 }, { "epoch": 8.512809055704498, "grad_norm": 0.23164919018745422, "learning_rate": 3.2943363325537046e-06, "loss": 0.8057, "num_input_tokens_seen": 32974600, "step": 57155 }, { "epoch": 8.513553768245457, "grad_norm": 0.22028224170207977, "learning_rate": 3.2911125482261577e-06, "loss": 0.811, "num_input_tokens_seen": 32977448, "step": 57160 }, { "epoch": 8.514298480786417, "grad_norm": 0.18319720029830933, "learning_rate": 3.287890230896959e-06, "loss": 0.8198, "num_input_tokens_seen": 32980072, "step": 57165 }, { "epoch": 8.515043193327376, "grad_norm": 0.2428046613931656, "learning_rate": 3.284669380783864e-06, "loss": 0.8247, "num_input_tokens_seen": 32982920, "step": 57170 }, { "epoch": 8.515787905868335, "grad_norm": 0.21754205226898193, "learning_rate": 3.2814499981045217e-06, "loss": 0.8021, "num_input_tokens_seen": 32985800, "step": 57175 }, { "epoch": 8.516532618409293, "grad_norm": 0.15912911295890808, "learning_rate": 3.2782320830764877e-06, "loss": 0.8065, "num_input_tokens_seen": 32988808, "step": 57180 }, { "epoch": 8.517277330950254, "grad_norm": 0.19486196339130402, "learning_rate": 3.2750156359172224e-06, "loss": 0.7891, "num_input_tokens_seen": 32991560, "step": 57185 }, { "epoch": 8.518022043491213, "grad_norm": 0.2932376563549042, "learning_rate": 3.271800656844065e-06, "loss": 0.8013, "num_input_tokens_seen": 32994440, "step": 57190 }, { "epoch": 8.518766756032171, "grad_norm": 0.19494515657424927, "learning_rate": 3.268587146074281e-06, "loss": 0.8021, "num_input_tokens_seen": 32997096, "step": 57195 }, { "epoch": 8.51951146857313, "grad_norm": 0.19416548311710358, "learning_rate": 3.26537510382503e-06, "loss": 0.7711, "num_input_tokens_seen": 33000232, "step": 57200 }, { "epoch": 8.52025618111409, "grad_norm": 0.24577055871486664, "learning_rate": 3.2621645303133553e-06, "loss": 0.7926, "num_input_tokens_seen": 33003368, "step": 57205 }, { "epoch": 8.52100089365505, "grad_norm": 0.21109504997730255, "learning_rate": 3.2589554257562243e-06, "loss": 0.8142, "num_input_tokens_seen": 33006536, "step": 57210 }, { "epoch": 8.521745606196008, "grad_norm": 0.16323693096637726, "learning_rate": 3.255747790370489e-06, "loss": 0.7888, "num_input_tokens_seen": 33009640, "step": 57215 }, { "epoch": 8.522490318736967, "grad_norm": 0.21462656557559967, "learning_rate": 3.2525416243729236e-06, "loss": 0.8031, "num_input_tokens_seen": 33012648, "step": 57220 }, { "epoch": 8.523235031277927, "grad_norm": 0.28477025032043457, "learning_rate": 3.2493369279801677e-06, "loss": 0.8078, "num_input_tokens_seen": 33015976, "step": 57225 }, { "epoch": 8.523979743818886, "grad_norm": 0.1668401062488556, "learning_rate": 3.2461337014087907e-06, "loss": 0.827, "num_input_tokens_seen": 33018600, "step": 57230 }, { "epoch": 8.524724456359845, "grad_norm": 0.2985784709453583, "learning_rate": 3.242931944875252e-06, "loss": 0.8134, "num_input_tokens_seen": 33021736, "step": 57235 }, { "epoch": 8.525469168900804, "grad_norm": 0.18480443954467773, "learning_rate": 3.239731658595921e-06, "loss": 0.7714, "num_input_tokens_seen": 33024808, "step": 57240 }, { "epoch": 8.526213881441764, "grad_norm": 0.22895920276641846, "learning_rate": 3.236532842787049e-06, "loss": 0.7807, "num_input_tokens_seen": 33027784, "step": 57245 }, { "epoch": 8.526958593982723, "grad_norm": 0.224901020526886, "learning_rate": 3.233335497664805e-06, "loss": 0.8072, "num_input_tokens_seen": 33030600, "step": 57250 }, { "epoch": 8.527703306523682, "grad_norm": 0.19548045098781586, "learning_rate": 3.230139623445255e-06, "loss": 0.799, "num_input_tokens_seen": 33033160, "step": 57255 }, { "epoch": 8.52844801906464, "grad_norm": 0.24939781427383423, "learning_rate": 3.2269452203443546e-06, "loss": 0.798, "num_input_tokens_seen": 33035912, "step": 57260 }, { "epoch": 8.529192731605601, "grad_norm": 0.3421763479709625, "learning_rate": 3.2237522885779718e-06, "loss": 0.7892, "num_input_tokens_seen": 33038920, "step": 57265 }, { "epoch": 8.52993744414656, "grad_norm": 0.22080372273921967, "learning_rate": 3.220560828361874e-06, "loss": 0.7888, "num_input_tokens_seen": 33041608, "step": 57270 }, { "epoch": 8.530682156687519, "grad_norm": 0.20223073661327362, "learning_rate": 3.217370839911729e-06, "loss": 0.7914, "num_input_tokens_seen": 33044168, "step": 57275 }, { "epoch": 8.531426869228477, "grad_norm": 0.1864350438117981, "learning_rate": 3.2141823234431045e-06, "loss": 0.7897, "num_input_tokens_seen": 33046856, "step": 57280 }, { "epoch": 8.532171581769436, "grad_norm": 0.250872403383255, "learning_rate": 3.2109952791714583e-06, "loss": 0.8078, "num_input_tokens_seen": 33049736, "step": 57285 }, { "epoch": 8.532916294310397, "grad_norm": 0.1880691796541214, "learning_rate": 3.2078097073121704e-06, "loss": 0.7968, "num_input_tokens_seen": 33052232, "step": 57290 }, { "epoch": 8.533661006851355, "grad_norm": 0.24059730768203735, "learning_rate": 3.2046256080804943e-06, "loss": 0.7821, "num_input_tokens_seen": 33055080, "step": 57295 }, { "epoch": 8.534405719392314, "grad_norm": 0.16312171518802643, "learning_rate": 3.2014429816916074e-06, "loss": 0.8001, "num_input_tokens_seen": 33057576, "step": 57300 }, { "epoch": 8.535150431933273, "grad_norm": 0.2214759886264801, "learning_rate": 3.198261828360577e-06, "loss": 0.804, "num_input_tokens_seen": 33060328, "step": 57305 }, { "epoch": 8.535895144474233, "grad_norm": 0.17966137826442719, "learning_rate": 3.1950821483023723e-06, "loss": 0.7874, "num_input_tokens_seen": 33063016, "step": 57310 }, { "epoch": 8.536639857015192, "grad_norm": 0.2307722419500351, "learning_rate": 3.191903941731866e-06, "loss": 0.8067, "num_input_tokens_seen": 33065800, "step": 57315 }, { "epoch": 8.537384569556151, "grad_norm": 0.33049410581588745, "learning_rate": 3.188727208863829e-06, "loss": 0.785, "num_input_tokens_seen": 33069032, "step": 57320 }, { "epoch": 8.53812928209711, "grad_norm": 0.24567340314388275, "learning_rate": 3.1855519499129293e-06, "loss": 0.8007, "num_input_tokens_seen": 33071816, "step": 57325 }, { "epoch": 8.53887399463807, "grad_norm": 0.1703004390001297, "learning_rate": 3.1823781650937328e-06, "loss": 0.7794, "num_input_tokens_seen": 33074760, "step": 57330 }, { "epoch": 8.539618707179029, "grad_norm": 0.20512226223945618, "learning_rate": 3.1792058546207174e-06, "loss": 0.8369, "num_input_tokens_seen": 33077928, "step": 57335 }, { "epoch": 8.540363419719988, "grad_norm": 0.1861990988254547, "learning_rate": 3.176035018708251e-06, "loss": 0.8117, "num_input_tokens_seen": 33080680, "step": 57340 }, { "epoch": 8.541108132260947, "grad_norm": 0.16571390628814697, "learning_rate": 3.1728656575706118e-06, "loss": 0.7866, "num_input_tokens_seen": 33083400, "step": 57345 }, { "epoch": 8.541852844801907, "grad_norm": 0.2480134218931198, "learning_rate": 3.16969777142197e-06, "loss": 0.7871, "num_input_tokens_seen": 33085928, "step": 57350 }, { "epoch": 8.542597557342866, "grad_norm": 0.21574588119983673, "learning_rate": 3.1665313604763937e-06, "loss": 0.816, "num_input_tokens_seen": 33088712, "step": 57355 }, { "epoch": 8.543342269883825, "grad_norm": 0.21720241010189056, "learning_rate": 3.163366424947864e-06, "loss": 0.8184, "num_input_tokens_seen": 33091432, "step": 57360 }, { "epoch": 8.544086982424783, "grad_norm": 0.23604612052440643, "learning_rate": 3.1602029650502463e-06, "loss": 0.78, "num_input_tokens_seen": 33094056, "step": 57365 }, { "epoch": 8.544831694965744, "grad_norm": 0.192279651761055, "learning_rate": 3.1570409809973165e-06, "loss": 0.7772, "num_input_tokens_seen": 33096968, "step": 57370 }, { "epoch": 8.545576407506703, "grad_norm": 0.18416093289852142, "learning_rate": 3.153880473002752e-06, "loss": 0.7845, "num_input_tokens_seen": 33099976, "step": 57375 }, { "epoch": 8.546321120047661, "grad_norm": 0.20715013146400452, "learning_rate": 3.1507214412801243e-06, "loss": 0.8084, "num_input_tokens_seen": 33102920, "step": 57380 }, { "epoch": 8.54706583258862, "grad_norm": 0.3393325209617615, "learning_rate": 3.1475638860429147e-06, "loss": 0.7651, "num_input_tokens_seen": 33106088, "step": 57385 }, { "epoch": 8.54781054512958, "grad_norm": 0.23734581470489502, "learning_rate": 3.1444078075044873e-06, "loss": 0.782, "num_input_tokens_seen": 33109000, "step": 57390 }, { "epoch": 8.54855525767054, "grad_norm": 0.2330792397260666, "learning_rate": 3.1412532058781198e-06, "loss": 0.7934, "num_input_tokens_seen": 33111848, "step": 57395 }, { "epoch": 8.549299970211498, "grad_norm": 0.3473508059978485, "learning_rate": 3.138100081376996e-06, "loss": 0.7831, "num_input_tokens_seen": 33115880, "step": 57400 }, { "epoch": 8.550044682752457, "grad_norm": 0.22656656801700592, "learning_rate": 3.13494843421418e-06, "loss": 0.7774, "num_input_tokens_seen": 33118280, "step": 57405 }, { "epoch": 8.550789395293418, "grad_norm": 0.22744250297546387, "learning_rate": 3.1317982646026507e-06, "loss": 0.8173, "num_input_tokens_seen": 33120840, "step": 57410 }, { "epoch": 8.551534107834376, "grad_norm": 0.22108009457588196, "learning_rate": 3.128649572755285e-06, "loss": 0.8006, "num_input_tokens_seen": 33123464, "step": 57415 }, { "epoch": 8.552278820375335, "grad_norm": 0.315799355506897, "learning_rate": 3.125502358884866e-06, "loss": 0.7851, "num_input_tokens_seen": 33126632, "step": 57420 }, { "epoch": 8.553023532916294, "grad_norm": 0.15486153960227966, "learning_rate": 3.1223566232040564e-06, "loss": 0.797, "num_input_tokens_seen": 33129544, "step": 57425 }, { "epoch": 8.553768245457253, "grad_norm": 0.2509444057941437, "learning_rate": 3.1192123659254364e-06, "loss": 0.8421, "num_input_tokens_seen": 33132488, "step": 57430 }, { "epoch": 8.554512957998213, "grad_norm": 0.18417176604270935, "learning_rate": 3.116069587261486e-06, "loss": 0.7809, "num_input_tokens_seen": 33135368, "step": 57435 }, { "epoch": 8.555257670539172, "grad_norm": 0.2302974909543991, "learning_rate": 3.1129282874245826e-06, "loss": 0.8072, "num_input_tokens_seen": 33138184, "step": 57440 }, { "epoch": 8.55600238308013, "grad_norm": 0.254868745803833, "learning_rate": 3.109788466626995e-06, "loss": 0.7971, "num_input_tokens_seen": 33141320, "step": 57445 }, { "epoch": 8.556747095621091, "grad_norm": 0.2707544267177582, "learning_rate": 3.106650125080904e-06, "loss": 0.8216, "num_input_tokens_seen": 33144040, "step": 57450 }, { "epoch": 8.55749180816205, "grad_norm": 0.188860684633255, "learning_rate": 3.103513262998392e-06, "loss": 0.7962, "num_input_tokens_seen": 33146632, "step": 57455 }, { "epoch": 8.558236520703009, "grad_norm": 0.21853047609329224, "learning_rate": 3.1003778805914207e-06, "loss": 0.7928, "num_input_tokens_seen": 33149352, "step": 57460 }, { "epoch": 8.558981233243967, "grad_norm": 0.16643628478050232, "learning_rate": 3.0972439780718786e-06, "loss": 0.7979, "num_input_tokens_seen": 33152232, "step": 57465 }, { "epoch": 8.559725945784926, "grad_norm": 0.15056024491786957, "learning_rate": 3.0941115556515355e-06, "loss": 0.8094, "num_input_tokens_seen": 33154984, "step": 57470 }, { "epoch": 8.560470658325887, "grad_norm": 0.24521198868751526, "learning_rate": 3.0909806135420714e-06, "loss": 0.8248, "num_input_tokens_seen": 33157928, "step": 57475 }, { "epoch": 8.561215370866845, "grad_norm": 0.34092599153518677, "learning_rate": 3.0878511519550623e-06, "loss": 0.7999, "num_input_tokens_seen": 33160968, "step": 57480 }, { "epoch": 8.561960083407804, "grad_norm": 0.2890431582927704, "learning_rate": 3.0847231711019884e-06, "loss": 0.7854, "num_input_tokens_seen": 33163688, "step": 57485 }, { "epoch": 8.562704795948763, "grad_norm": 0.47764208912849426, "learning_rate": 3.0815966711942227e-06, "loss": 0.8045, "num_input_tokens_seen": 33166792, "step": 57490 }, { "epoch": 8.563449508489724, "grad_norm": 0.22958588600158691, "learning_rate": 3.078471652443035e-06, "loss": 0.7963, "num_input_tokens_seen": 33169576, "step": 57495 }, { "epoch": 8.564194221030682, "grad_norm": 0.2003105729818344, "learning_rate": 3.0753481150596038e-06, "loss": 0.8083, "num_input_tokens_seen": 33172584, "step": 57500 }, { "epoch": 8.564938933571641, "grad_norm": 0.17303767800331116, "learning_rate": 3.072226059255012e-06, "loss": 0.7836, "num_input_tokens_seen": 33175464, "step": 57505 }, { "epoch": 8.5656836461126, "grad_norm": 0.23480798304080963, "learning_rate": 3.0691054852402286e-06, "loss": 0.7777, "num_input_tokens_seen": 33178248, "step": 57510 }, { "epoch": 8.56642835865356, "grad_norm": 0.23828063905239105, "learning_rate": 3.065986393226139e-06, "loss": 0.7992, "num_input_tokens_seen": 33181448, "step": 57515 }, { "epoch": 8.567173071194519, "grad_norm": 0.2133857011795044, "learning_rate": 3.0628687834235032e-06, "loss": 0.7759, "num_input_tokens_seen": 33184040, "step": 57520 }, { "epoch": 8.567917783735478, "grad_norm": 0.27630001306533813, "learning_rate": 3.0597526560430133e-06, "loss": 0.8117, "num_input_tokens_seen": 33186984, "step": 57525 }, { "epoch": 8.568662496276437, "grad_norm": 0.21600191295146942, "learning_rate": 3.056638011295229e-06, "loss": 0.821, "num_input_tokens_seen": 33189896, "step": 57530 }, { "epoch": 8.569407208817397, "grad_norm": 0.21167661249637604, "learning_rate": 3.053524849390635e-06, "loss": 0.801, "num_input_tokens_seen": 33192680, "step": 57535 }, { "epoch": 8.570151921358356, "grad_norm": 0.25064730644226074, "learning_rate": 3.050413170539604e-06, "loss": 0.7777, "num_input_tokens_seen": 33195368, "step": 57540 }, { "epoch": 8.570896633899315, "grad_norm": 0.2520010769367218, "learning_rate": 3.0473029749524094e-06, "loss": 0.7981, "num_input_tokens_seen": 33198280, "step": 57545 }, { "epoch": 8.571641346440273, "grad_norm": 0.23066586256027222, "learning_rate": 3.044194262839231e-06, "loss": 0.8083, "num_input_tokens_seen": 33201352, "step": 57550 }, { "epoch": 8.572386058981234, "grad_norm": 0.3013802170753479, "learning_rate": 3.041087034410134e-06, "loss": 0.7998, "num_input_tokens_seen": 33204040, "step": 57555 }, { "epoch": 8.573130771522193, "grad_norm": 0.23613351583480835, "learning_rate": 3.037981289875097e-06, "loss": 0.8009, "num_input_tokens_seen": 33207016, "step": 57560 }, { "epoch": 8.573875484063151, "grad_norm": 0.21956630051136017, "learning_rate": 3.0348770294439973e-06, "loss": 0.8088, "num_input_tokens_seen": 33209832, "step": 57565 }, { "epoch": 8.57462019660411, "grad_norm": 0.24577556550502777, "learning_rate": 3.0317742533266024e-06, "loss": 0.7862, "num_input_tokens_seen": 33212328, "step": 57570 }, { "epoch": 8.57536490914507, "grad_norm": 0.2964596748352051, "learning_rate": 3.0286729617325844e-06, "loss": 0.8063, "num_input_tokens_seen": 33215240, "step": 57575 }, { "epoch": 8.57610962168603, "grad_norm": 0.24950605630874634, "learning_rate": 3.0255731548715195e-06, "loss": 0.8082, "num_input_tokens_seen": 33218152, "step": 57580 }, { "epoch": 8.576854334226988, "grad_norm": 0.17796091735363007, "learning_rate": 3.0224748329528846e-06, "loss": 0.7957, "num_input_tokens_seen": 33220872, "step": 57585 }, { "epoch": 8.577599046767947, "grad_norm": 0.20612815022468567, "learning_rate": 3.0193779961860403e-06, "loss": 0.8261, "num_input_tokens_seen": 33224008, "step": 57590 }, { "epoch": 8.578343759308908, "grad_norm": 0.2625673711299896, "learning_rate": 3.0162826447802634e-06, "loss": 0.8067, "num_input_tokens_seen": 33226888, "step": 57595 }, { "epoch": 8.579088471849866, "grad_norm": 0.2326028048992157, "learning_rate": 3.0131887789447284e-06, "loss": 0.7942, "num_input_tokens_seen": 33229736, "step": 57600 }, { "epoch": 8.579833184390825, "grad_norm": 0.3095293641090393, "learning_rate": 3.0100963988885067e-06, "loss": 0.8035, "num_input_tokens_seen": 33232808, "step": 57605 }, { "epoch": 8.580577896931784, "grad_norm": 0.31394636631011963, "learning_rate": 3.0070055048205647e-06, "loss": 0.7831, "num_input_tokens_seen": 33235784, "step": 57610 }, { "epoch": 8.581322609472743, "grad_norm": 0.265205442905426, "learning_rate": 3.003916096949769e-06, "loss": 0.8008, "num_input_tokens_seen": 33238696, "step": 57615 }, { "epoch": 8.582067322013703, "grad_norm": 0.29132047295570374, "learning_rate": 3.0008281754849018e-06, "loss": 0.7956, "num_input_tokens_seen": 33241448, "step": 57620 }, { "epoch": 8.582812034554662, "grad_norm": 0.2969840168952942, "learning_rate": 2.9977417406346186e-06, "loss": 0.808, "num_input_tokens_seen": 33244296, "step": 57625 }, { "epoch": 8.58355674709562, "grad_norm": 0.19577781856060028, "learning_rate": 2.994656792607495e-06, "loss": 0.7672, "num_input_tokens_seen": 33247176, "step": 57630 }, { "epoch": 8.584301459636581, "grad_norm": 0.2264270931482315, "learning_rate": 2.9915733316119963e-06, "loss": 0.7968, "num_input_tokens_seen": 33250088, "step": 57635 }, { "epoch": 8.58504617217754, "grad_norm": 0.20040035247802734, "learning_rate": 2.988491357856493e-06, "loss": 0.7766, "num_input_tokens_seen": 33253192, "step": 57640 }, { "epoch": 8.585790884718499, "grad_norm": 0.27203378081321716, "learning_rate": 2.9854108715492572e-06, "loss": 0.7845, "num_input_tokens_seen": 33255880, "step": 57645 }, { "epoch": 8.586535597259457, "grad_norm": 0.19560426473617554, "learning_rate": 2.9823318728984447e-06, "loss": 0.8007, "num_input_tokens_seen": 33258824, "step": 57650 }, { "epoch": 8.587280309800416, "grad_norm": 0.20338688790798187, "learning_rate": 2.97925436211213e-06, "loss": 0.8076, "num_input_tokens_seen": 33261704, "step": 57655 }, { "epoch": 8.588025022341377, "grad_norm": 0.24025247991085052, "learning_rate": 2.9761783393982722e-06, "loss": 0.7725, "num_input_tokens_seen": 33264872, "step": 57660 }, { "epoch": 8.588769734882336, "grad_norm": 0.13651499152183533, "learning_rate": 2.9731038049647385e-06, "loss": 0.7596, "num_input_tokens_seen": 33267944, "step": 57665 }, { "epoch": 8.589514447423294, "grad_norm": 0.31038010120391846, "learning_rate": 2.970030759019296e-06, "loss": 0.7989, "num_input_tokens_seen": 33271144, "step": 57670 }, { "epoch": 8.590259159964253, "grad_norm": 0.18894940614700317, "learning_rate": 2.966959201769609e-06, "loss": 0.8043, "num_input_tokens_seen": 33273672, "step": 57675 }, { "epoch": 8.591003872505214, "grad_norm": 0.2283015102148056, "learning_rate": 2.963889133423242e-06, "loss": 0.7953, "num_input_tokens_seen": 33276584, "step": 57680 }, { "epoch": 8.591748585046172, "grad_norm": 0.14810624718666077, "learning_rate": 2.9608205541876516e-06, "loss": 0.7775, "num_input_tokens_seen": 33279304, "step": 57685 }, { "epoch": 8.592493297587131, "grad_norm": 0.22111521661281586, "learning_rate": 2.957753464270208e-06, "loss": 0.7637, "num_input_tokens_seen": 33282088, "step": 57690 }, { "epoch": 8.59323801012809, "grad_norm": 0.20636539161205292, "learning_rate": 2.954687863878164e-06, "loss": 0.8184, "num_input_tokens_seen": 33285128, "step": 57695 }, { "epoch": 8.59398272266905, "grad_norm": 0.256591260433197, "learning_rate": 2.9516237532186826e-06, "loss": 0.7762, "num_input_tokens_seen": 33288040, "step": 57700 }, { "epoch": 8.59472743521001, "grad_norm": 0.23090443015098572, "learning_rate": 2.9485611324988254e-06, "loss": 0.7829, "num_input_tokens_seen": 33290824, "step": 57705 }, { "epoch": 8.595472147750968, "grad_norm": 0.41374197602272034, "learning_rate": 2.9455000019255524e-06, "loss": 0.7973, "num_input_tokens_seen": 33293960, "step": 57710 }, { "epoch": 8.596216860291927, "grad_norm": 0.28631240129470825, "learning_rate": 2.9424403617057285e-06, "loss": 0.8133, "num_input_tokens_seen": 33296712, "step": 57715 }, { "epoch": 8.596961572832887, "grad_norm": 0.26785212755203247, "learning_rate": 2.939382212046099e-06, "loss": 0.8158, "num_input_tokens_seen": 33299560, "step": 57720 }, { "epoch": 8.597706285373846, "grad_norm": 0.2857823967933655, "learning_rate": 2.936325553153335e-06, "loss": 0.7993, "num_input_tokens_seen": 33302600, "step": 57725 }, { "epoch": 8.598450997914805, "grad_norm": 0.25006452202796936, "learning_rate": 2.9332703852339797e-06, "loss": 0.8078, "num_input_tokens_seen": 33305480, "step": 57730 }, { "epoch": 8.599195710455763, "grad_norm": 0.22542709112167358, "learning_rate": 2.930216708494493e-06, "loss": 0.7631, "num_input_tokens_seen": 33308552, "step": 57735 }, { "epoch": 8.599940422996724, "grad_norm": 0.23983162641525269, "learning_rate": 2.927164523141235e-06, "loss": 0.804, "num_input_tokens_seen": 33311208, "step": 57740 }, { "epoch": 8.600685135537683, "grad_norm": 0.19910204410552979, "learning_rate": 2.9241138293804565e-06, "loss": 0.8013, "num_input_tokens_seen": 33313832, "step": 57745 }, { "epoch": 8.601429848078642, "grad_norm": 0.2120739221572876, "learning_rate": 2.9210646274183157e-06, "loss": 0.8068, "num_input_tokens_seen": 33316840, "step": 57750 }, { "epoch": 8.6021745606196, "grad_norm": 0.2051829993724823, "learning_rate": 2.9180169174608555e-06, "loss": 0.7694, "num_input_tokens_seen": 33319784, "step": 57755 }, { "epoch": 8.60291927316056, "grad_norm": 0.19473575055599213, "learning_rate": 2.9149706997140316e-06, "loss": 0.7804, "num_input_tokens_seen": 33322632, "step": 57760 }, { "epoch": 8.60366398570152, "grad_norm": 0.20071928203105927, "learning_rate": 2.911925974383703e-06, "loss": 0.7919, "num_input_tokens_seen": 33325480, "step": 57765 }, { "epoch": 8.604408698242478, "grad_norm": 0.20262880623340607, "learning_rate": 2.908882741675609e-06, "loss": 0.8361, "num_input_tokens_seen": 33328392, "step": 57770 }, { "epoch": 8.605153410783437, "grad_norm": 0.20197753608226776, "learning_rate": 2.9058410017954035e-06, "loss": 0.7964, "num_input_tokens_seen": 33331144, "step": 57775 }, { "epoch": 8.605898123324398, "grad_norm": 0.33871781826019287, "learning_rate": 2.902800754948634e-06, "loss": 0.8167, "num_input_tokens_seen": 33334344, "step": 57780 }, { "epoch": 8.606642835865356, "grad_norm": 0.2621096670627594, "learning_rate": 2.8997620013407557e-06, "loss": 0.7971, "num_input_tokens_seen": 33337192, "step": 57785 }, { "epoch": 8.607387548406315, "grad_norm": 0.2579960227012634, "learning_rate": 2.896724741177101e-06, "loss": 0.7776, "num_input_tokens_seen": 33339912, "step": 57790 }, { "epoch": 8.608132260947274, "grad_norm": 0.22309590876102448, "learning_rate": 2.893688974662925e-06, "loss": 0.7944, "num_input_tokens_seen": 33342600, "step": 57795 }, { "epoch": 8.608876973488233, "grad_norm": 0.19741186499595642, "learning_rate": 2.8906547020033703e-06, "loss": 0.7917, "num_input_tokens_seen": 33345448, "step": 57800 }, { "epoch": 8.609621686029193, "grad_norm": 0.21175654232501984, "learning_rate": 2.887621923403483e-06, "loss": 0.7752, "num_input_tokens_seen": 33348360, "step": 57805 }, { "epoch": 8.610366398570152, "grad_norm": 0.284310907125473, "learning_rate": 2.884590639068202e-06, "loss": 0.7884, "num_input_tokens_seen": 33351400, "step": 57810 }, { "epoch": 8.61111111111111, "grad_norm": 0.2091183066368103, "learning_rate": 2.8815608492023696e-06, "loss": 0.8294, "num_input_tokens_seen": 33354152, "step": 57815 }, { "epoch": 8.61185582365207, "grad_norm": 0.1774836629629135, "learning_rate": 2.878532554010732e-06, "loss": 0.7755, "num_input_tokens_seen": 33357256, "step": 57820 }, { "epoch": 8.61260053619303, "grad_norm": 0.23407670855522156, "learning_rate": 2.875505753697921e-06, "loss": 0.8041, "num_input_tokens_seen": 33360072, "step": 57825 }, { "epoch": 8.613345248733989, "grad_norm": 0.18253889679908752, "learning_rate": 2.8724804484684785e-06, "loss": 0.8162, "num_input_tokens_seen": 33362888, "step": 57830 }, { "epoch": 8.614089961274948, "grad_norm": 0.27646657824516296, "learning_rate": 2.8694566385268463e-06, "loss": 0.8109, "num_input_tokens_seen": 33365640, "step": 57835 }, { "epoch": 8.614834673815906, "grad_norm": 0.19758641719818115, "learning_rate": 2.866434324077355e-06, "loss": 0.7959, "num_input_tokens_seen": 33368648, "step": 57840 }, { "epoch": 8.615579386356867, "grad_norm": 0.17572090029716492, "learning_rate": 2.86341350532425e-06, "loss": 0.7982, "num_input_tokens_seen": 33371464, "step": 57845 }, { "epoch": 8.616324098897826, "grad_norm": 0.27213624119758606, "learning_rate": 2.8603941824716542e-06, "loss": 0.8217, "num_input_tokens_seen": 33374280, "step": 57850 }, { "epoch": 8.617068811438784, "grad_norm": 0.3012712299823761, "learning_rate": 2.857376355723612e-06, "loss": 0.7948, "num_input_tokens_seen": 33377096, "step": 57855 }, { "epoch": 8.617813523979743, "grad_norm": 0.20850074291229248, "learning_rate": 2.8543600252840448e-06, "loss": 0.7875, "num_input_tokens_seen": 33380008, "step": 57860 }, { "epoch": 8.618558236520704, "grad_norm": 0.24945440888404846, "learning_rate": 2.8513451913567883e-06, "loss": 0.7863, "num_input_tokens_seen": 33382888, "step": 57865 }, { "epoch": 8.619302949061662, "grad_norm": 0.21740342676639557, "learning_rate": 2.848331854145575e-06, "loss": 0.7896, "num_input_tokens_seen": 33385896, "step": 57870 }, { "epoch": 8.620047661602621, "grad_norm": 0.20315320789813995, "learning_rate": 2.845320013854033e-06, "loss": 0.7848, "num_input_tokens_seen": 33388744, "step": 57875 }, { "epoch": 8.62079237414358, "grad_norm": 0.23307758569717407, "learning_rate": 2.8423096706856973e-06, "loss": 0.7971, "num_input_tokens_seen": 33391592, "step": 57880 }, { "epoch": 8.62153708668454, "grad_norm": 0.22317759692668915, "learning_rate": 2.839300824843985e-06, "loss": 0.8056, "num_input_tokens_seen": 33394600, "step": 57885 }, { "epoch": 8.6222817992255, "grad_norm": 0.2246987521648407, "learning_rate": 2.8362934765322174e-06, "loss": 0.7956, "num_input_tokens_seen": 33397448, "step": 57890 }, { "epoch": 8.623026511766458, "grad_norm": 0.19106905162334442, "learning_rate": 2.833287625953629e-06, "loss": 0.8069, "num_input_tokens_seen": 33400392, "step": 57895 }, { "epoch": 8.623771224307417, "grad_norm": 0.308512419462204, "learning_rate": 2.8302832733113376e-06, "loss": 0.7879, "num_input_tokens_seen": 33403432, "step": 57900 }, { "epoch": 8.624515936848377, "grad_norm": 0.24316982924938202, "learning_rate": 2.8272804188083675e-06, "loss": 0.8112, "num_input_tokens_seen": 33406312, "step": 57905 }, { "epoch": 8.625260649389336, "grad_norm": 0.24221402406692505, "learning_rate": 2.824279062647639e-06, "loss": 0.7935, "num_input_tokens_seen": 33409256, "step": 57910 }, { "epoch": 8.626005361930295, "grad_norm": 0.27743908762931824, "learning_rate": 2.8212792050319766e-06, "loss": 0.7774, "num_input_tokens_seen": 33412296, "step": 57915 }, { "epoch": 8.626750074471254, "grad_norm": 0.2956455945968628, "learning_rate": 2.8182808461640897e-06, "loss": 0.7909, "num_input_tokens_seen": 33415112, "step": 57920 }, { "epoch": 8.627494787012214, "grad_norm": 0.22730480134487152, "learning_rate": 2.8152839862466027e-06, "loss": 0.7797, "num_input_tokens_seen": 33417704, "step": 57925 }, { "epoch": 8.628239499553173, "grad_norm": 0.23987402021884918, "learning_rate": 2.812288625482021e-06, "loss": 0.8223, "num_input_tokens_seen": 33420648, "step": 57930 }, { "epoch": 8.628984212094132, "grad_norm": 0.20294417440891266, "learning_rate": 2.8092947640727673e-06, "loss": 0.798, "num_input_tokens_seen": 33423592, "step": 57935 }, { "epoch": 8.62972892463509, "grad_norm": 0.29163992404937744, "learning_rate": 2.8063024022211533e-06, "loss": 0.8003, "num_input_tokens_seen": 33426376, "step": 57940 }, { "epoch": 8.63047363717605, "grad_norm": 0.2595879137516022, "learning_rate": 2.8033115401293884e-06, "loss": 0.7969, "num_input_tokens_seen": 33429864, "step": 57945 }, { "epoch": 8.63121834971701, "grad_norm": 0.2596365213394165, "learning_rate": 2.80032217799959e-06, "loss": 0.7967, "num_input_tokens_seen": 33433000, "step": 57950 }, { "epoch": 8.631963062257968, "grad_norm": 0.2241058647632599, "learning_rate": 2.7973343160337562e-06, "loss": 0.7826, "num_input_tokens_seen": 33435752, "step": 57955 }, { "epoch": 8.632707774798927, "grad_norm": 0.20815736055374146, "learning_rate": 2.7943479544337988e-06, "loss": 0.8082, "num_input_tokens_seen": 33438216, "step": 57960 }, { "epoch": 8.633452487339888, "grad_norm": 0.23484784364700317, "learning_rate": 2.7913630934015304e-06, "loss": 0.8014, "num_input_tokens_seen": 33441128, "step": 57965 }, { "epoch": 8.634197199880846, "grad_norm": 0.18589255213737488, "learning_rate": 2.7883797331386465e-06, "loss": 0.7717, "num_input_tokens_seen": 33444008, "step": 57970 }, { "epoch": 8.634941912421805, "grad_norm": 0.29442405700683594, "learning_rate": 2.785397873846754e-06, "loss": 0.8066, "num_input_tokens_seen": 33446696, "step": 57975 }, { "epoch": 8.635686624962764, "grad_norm": 0.2265525609254837, "learning_rate": 2.7824175157273564e-06, "loss": 0.7867, "num_input_tokens_seen": 33449640, "step": 57980 }, { "epoch": 8.636431337503723, "grad_norm": 0.18456247448921204, "learning_rate": 2.779438658981856e-06, "loss": 0.7716, "num_input_tokens_seen": 33452456, "step": 57985 }, { "epoch": 8.637176050044683, "grad_norm": 0.16563370823860168, "learning_rate": 2.776461303811545e-06, "loss": 0.7938, "num_input_tokens_seen": 33455144, "step": 57990 }, { "epoch": 8.637920762585642, "grad_norm": 0.3246537446975708, "learning_rate": 2.7734854504176234e-06, "loss": 0.8302, "num_input_tokens_seen": 33457832, "step": 57995 }, { "epoch": 8.6386654751266, "grad_norm": 0.23346954584121704, "learning_rate": 2.770511099001191e-06, "loss": 0.7744, "num_input_tokens_seen": 33460456, "step": 58000 }, { "epoch": 8.63941018766756, "grad_norm": 0.27235692739486694, "learning_rate": 2.7675382497632435e-06, "loss": 0.8242, "num_input_tokens_seen": 33463336, "step": 58005 }, { "epoch": 8.64015490020852, "grad_norm": 0.27366772294044495, "learning_rate": 2.764566902904664e-06, "loss": 0.7749, "num_input_tokens_seen": 33466248, "step": 58010 }, { "epoch": 8.640899612749479, "grad_norm": 0.1707860827445984, "learning_rate": 2.761597058626253e-06, "loss": 0.773, "num_input_tokens_seen": 33469224, "step": 58015 }, { "epoch": 8.641644325290438, "grad_norm": 0.23586419224739075, "learning_rate": 2.758628717128703e-06, "loss": 0.7755, "num_input_tokens_seen": 33472488, "step": 58020 }, { "epoch": 8.642389037831396, "grad_norm": 0.25537174940109253, "learning_rate": 2.755661878612592e-06, "loss": 0.801, "num_input_tokens_seen": 33475304, "step": 58025 }, { "epoch": 8.643133750372357, "grad_norm": 0.20181232690811157, "learning_rate": 2.75269654327841e-06, "loss": 0.7791, "num_input_tokens_seen": 33478152, "step": 58030 }, { "epoch": 8.643878462913316, "grad_norm": 0.7028248310089111, "learning_rate": 2.749732711326547e-06, "loss": 0.8189, "num_input_tokens_seen": 33481384, "step": 58035 }, { "epoch": 8.644623175454274, "grad_norm": 0.21417880058288574, "learning_rate": 2.7467703829572836e-06, "loss": 0.8085, "num_input_tokens_seen": 33484200, "step": 58040 }, { "epoch": 8.645367887995233, "grad_norm": 0.20367242395877838, "learning_rate": 2.7438095583708078e-06, "loss": 0.7947, "num_input_tokens_seen": 33487144, "step": 58045 }, { "epoch": 8.646112600536194, "grad_norm": 0.20792487263679504, "learning_rate": 2.740850237767195e-06, "loss": 0.7814, "num_input_tokens_seen": 33490152, "step": 58050 }, { "epoch": 8.646857313077152, "grad_norm": 0.2881222665309906, "learning_rate": 2.737892421346419e-06, "loss": 0.8139, "num_input_tokens_seen": 33492776, "step": 58055 }, { "epoch": 8.647602025618111, "grad_norm": 0.19885504245758057, "learning_rate": 2.7349361093083643e-06, "loss": 0.8008, "num_input_tokens_seen": 33495592, "step": 58060 }, { "epoch": 8.64834673815907, "grad_norm": 0.22363808751106262, "learning_rate": 2.7319813018528013e-06, "loss": 0.8115, "num_input_tokens_seen": 33498440, "step": 58065 }, { "epoch": 8.64909145070003, "grad_norm": 0.26866525411605835, "learning_rate": 2.7290279991794067e-06, "loss": 0.8105, "num_input_tokens_seen": 33501576, "step": 58070 }, { "epoch": 8.64983616324099, "grad_norm": 0.14380474388599396, "learning_rate": 2.7260762014877538e-06, "loss": 0.7729, "num_input_tokens_seen": 33504328, "step": 58075 }, { "epoch": 8.650580875781948, "grad_norm": 0.23868286609649658, "learning_rate": 2.723125908977317e-06, "loss": 0.7659, "num_input_tokens_seen": 33507496, "step": 58080 }, { "epoch": 8.651325588322907, "grad_norm": 0.1998521387577057, "learning_rate": 2.7201771218474558e-06, "loss": 0.7983, "num_input_tokens_seen": 33510312, "step": 58085 }, { "epoch": 8.652070300863867, "grad_norm": 0.19036108255386353, "learning_rate": 2.7172298402974443e-06, "loss": 0.8426, "num_input_tokens_seen": 33512968, "step": 58090 }, { "epoch": 8.652815013404826, "grad_norm": 0.19049017131328583, "learning_rate": 2.7142840645264426e-06, "loss": 0.7773, "num_input_tokens_seen": 33515912, "step": 58095 }, { "epoch": 8.653559725945785, "grad_norm": 0.15328960120677948, "learning_rate": 2.711339794733517e-06, "loss": 0.775, "num_input_tokens_seen": 33518504, "step": 58100 }, { "epoch": 8.654304438486744, "grad_norm": 0.18650096654891968, "learning_rate": 2.7083970311176267e-06, "loss": 0.8016, "num_input_tokens_seen": 33521256, "step": 58105 }, { "epoch": 8.655049151027704, "grad_norm": 0.34114959836006165, "learning_rate": 2.7054557738776356e-06, "loss": 0.7963, "num_input_tokens_seen": 33524136, "step": 58110 }, { "epoch": 8.655793863568663, "grad_norm": 0.1805841028690338, "learning_rate": 2.702516023212304e-06, "loss": 0.7848, "num_input_tokens_seen": 33526792, "step": 58115 }, { "epoch": 8.656538576109622, "grad_norm": 0.26821011304855347, "learning_rate": 2.699577779320278e-06, "loss": 0.7935, "num_input_tokens_seen": 33529416, "step": 58120 }, { "epoch": 8.65728328865058, "grad_norm": 0.16496866941452026, "learning_rate": 2.696641042400122e-06, "loss": 0.7849, "num_input_tokens_seen": 33532296, "step": 58125 }, { "epoch": 8.65802800119154, "grad_norm": 0.21423529088497162, "learning_rate": 2.6937058126502905e-06, "loss": 0.8308, "num_input_tokens_seen": 33535016, "step": 58130 }, { "epoch": 8.6587727137325, "grad_norm": 0.29861980676651, "learning_rate": 2.6907720902691226e-06, "loss": 0.7736, "num_input_tokens_seen": 33537960, "step": 58135 }, { "epoch": 8.659517426273458, "grad_norm": 0.35750991106033325, "learning_rate": 2.6878398754548756e-06, "loss": 0.8015, "num_input_tokens_seen": 33540968, "step": 58140 }, { "epoch": 8.660262138814417, "grad_norm": 0.28952619433403015, "learning_rate": 2.684909168405694e-06, "loss": 0.7969, "num_input_tokens_seen": 33543912, "step": 58145 }, { "epoch": 8.661006851355378, "grad_norm": 0.2722812592983246, "learning_rate": 2.6819799693196283e-06, "loss": 0.8242, "num_input_tokens_seen": 33546856, "step": 58150 }, { "epoch": 8.661751563896336, "grad_norm": 0.22028188407421112, "learning_rate": 2.6790522783946142e-06, "loss": 0.7906, "num_input_tokens_seen": 33549512, "step": 58155 }, { "epoch": 8.662496276437295, "grad_norm": 0.29658544063568115, "learning_rate": 2.676126095828496e-06, "loss": 0.8194, "num_input_tokens_seen": 33552392, "step": 58160 }, { "epoch": 8.663240988978254, "grad_norm": 0.28621160984039307, "learning_rate": 2.673201421819016e-06, "loss": 0.83, "num_input_tokens_seen": 33555304, "step": 58165 }, { "epoch": 8.663985701519213, "grad_norm": 0.19592949748039246, "learning_rate": 2.670278256563813e-06, "loss": 0.8057, "num_input_tokens_seen": 33558312, "step": 58170 }, { "epoch": 8.664730414060173, "grad_norm": 0.1705806404352188, "learning_rate": 2.667356600260415e-06, "loss": 0.8108, "num_input_tokens_seen": 33561288, "step": 58175 }, { "epoch": 8.665475126601132, "grad_norm": 0.22655220329761505, "learning_rate": 2.664436453106259e-06, "loss": 0.7771, "num_input_tokens_seen": 33564200, "step": 58180 }, { "epoch": 8.66621983914209, "grad_norm": 0.2008349746465683, "learning_rate": 2.6615178152986835e-06, "loss": 0.8105, "num_input_tokens_seen": 33567112, "step": 58185 }, { "epoch": 8.66696455168305, "grad_norm": 0.20181749761104584, "learning_rate": 2.6586006870349095e-06, "loss": 0.8094, "num_input_tokens_seen": 33570184, "step": 58190 }, { "epoch": 8.66770926422401, "grad_norm": 0.2296046018600464, "learning_rate": 2.6556850685120648e-06, "loss": 0.7868, "num_input_tokens_seen": 33573096, "step": 58195 }, { "epoch": 8.668453976764969, "grad_norm": 0.1887807846069336, "learning_rate": 2.6527709599271784e-06, "loss": 0.8156, "num_input_tokens_seen": 33575912, "step": 58200 }, { "epoch": 8.669198689305928, "grad_norm": 0.26952069997787476, "learning_rate": 2.649858361477173e-06, "loss": 0.8088, "num_input_tokens_seen": 33578952, "step": 58205 }, { "epoch": 8.669943401846886, "grad_norm": 0.23730650544166565, "learning_rate": 2.6469472733588767e-06, "loss": 0.8025, "num_input_tokens_seen": 33581736, "step": 58210 }, { "epoch": 8.670688114387847, "grad_norm": 0.22673457860946655, "learning_rate": 2.6440376957690026e-06, "loss": 0.7592, "num_input_tokens_seen": 33584648, "step": 58215 }, { "epoch": 8.671432826928806, "grad_norm": 0.23491041362285614, "learning_rate": 2.6411296289041627e-06, "loss": 0.816, "num_input_tokens_seen": 33587752, "step": 58220 }, { "epoch": 8.672177539469764, "grad_norm": 0.2547779977321625, "learning_rate": 2.638223072960877e-06, "loss": 0.815, "num_input_tokens_seen": 33590888, "step": 58225 }, { "epoch": 8.672922252010723, "grad_norm": 0.21760529279708862, "learning_rate": 2.635318028135561e-06, "loss": 0.7919, "num_input_tokens_seen": 33593864, "step": 58230 }, { "epoch": 8.673666964551684, "grad_norm": 0.21379059553146362, "learning_rate": 2.6324144946245244e-06, "loss": 0.8111, "num_input_tokens_seen": 33596968, "step": 58235 }, { "epoch": 8.674411677092642, "grad_norm": 0.24377769231796265, "learning_rate": 2.629512472623974e-06, "loss": 0.7867, "num_input_tokens_seen": 33599592, "step": 58240 }, { "epoch": 8.675156389633601, "grad_norm": 0.2116340845823288, "learning_rate": 2.6266119623300277e-06, "loss": 0.8195, "num_input_tokens_seen": 33602664, "step": 58245 }, { "epoch": 8.67590110217456, "grad_norm": 0.2212195098400116, "learning_rate": 2.6237129639386795e-06, "loss": 0.8041, "num_input_tokens_seen": 33605416, "step": 58250 }, { "epoch": 8.67664581471552, "grad_norm": 0.27901920676231384, "learning_rate": 2.620815477645827e-06, "loss": 0.7942, "num_input_tokens_seen": 33608200, "step": 58255 }, { "epoch": 8.67739052725648, "grad_norm": 0.1974206417798996, "learning_rate": 2.6179195036472815e-06, "loss": 0.788, "num_input_tokens_seen": 33611112, "step": 58260 }, { "epoch": 8.678135239797438, "grad_norm": 0.26098236441612244, "learning_rate": 2.615025042138733e-06, "loss": 0.8083, "num_input_tokens_seen": 33613928, "step": 58265 }, { "epoch": 8.678879952338397, "grad_norm": 0.1700456589460373, "learning_rate": 2.6121320933157834e-06, "loss": 0.789, "num_input_tokens_seen": 33616744, "step": 58270 }, { "epoch": 8.679624664879357, "grad_norm": 0.3081169128417969, "learning_rate": 2.6092406573739264e-06, "loss": 0.7965, "num_input_tokens_seen": 33619752, "step": 58275 }, { "epoch": 8.680369377420316, "grad_norm": 0.2063101977109909, "learning_rate": 2.606350734508553e-06, "loss": 0.7987, "num_input_tokens_seen": 33622632, "step": 58280 }, { "epoch": 8.681114089961275, "grad_norm": 0.43715450167655945, "learning_rate": 2.6034623249149487e-06, "loss": 0.7639, "num_input_tokens_seen": 33625832, "step": 58285 }, { "epoch": 8.681858802502234, "grad_norm": 0.14985375106334686, "learning_rate": 2.6005754287883072e-06, "loss": 0.7905, "num_input_tokens_seen": 33628744, "step": 58290 }, { "epoch": 8.682603515043194, "grad_norm": 0.20736931264400482, "learning_rate": 2.597690046323703e-06, "loss": 0.8, "num_input_tokens_seen": 33631592, "step": 58295 }, { "epoch": 8.683348227584153, "grad_norm": 0.18292932212352753, "learning_rate": 2.594806177716125e-06, "loss": 0.7965, "num_input_tokens_seen": 33634248, "step": 58300 }, { "epoch": 8.684092940125112, "grad_norm": 0.33925697207450867, "learning_rate": 2.5919238231604524e-06, "loss": 0.8032, "num_input_tokens_seen": 33637288, "step": 58305 }, { "epoch": 8.68483765266607, "grad_norm": 0.2111222892999649, "learning_rate": 2.589042982851461e-06, "loss": 0.7853, "num_input_tokens_seen": 33640040, "step": 58310 }, { "epoch": 8.68558236520703, "grad_norm": 0.3275025188922882, "learning_rate": 2.5861636569838366e-06, "loss": 0.8332, "num_input_tokens_seen": 33642888, "step": 58315 }, { "epoch": 8.68632707774799, "grad_norm": 0.24027638137340546, "learning_rate": 2.583285845752137e-06, "loss": 0.8004, "num_input_tokens_seen": 33645736, "step": 58320 }, { "epoch": 8.687071790288948, "grad_norm": 0.22462069988250732, "learning_rate": 2.580409549350843e-06, "loss": 0.7876, "num_input_tokens_seen": 33648360, "step": 58325 }, { "epoch": 8.687816502829907, "grad_norm": 0.1974562555551529, "learning_rate": 2.577534767974324e-06, "loss": 0.7865, "num_input_tokens_seen": 33651240, "step": 58330 }, { "epoch": 8.688561215370868, "grad_norm": 0.2675101161003113, "learning_rate": 2.574661501816836e-06, "loss": 0.7879, "num_input_tokens_seen": 33654408, "step": 58335 }, { "epoch": 8.689305927911827, "grad_norm": 0.23492059111595154, "learning_rate": 2.5717897510725508e-06, "loss": 0.7914, "num_input_tokens_seen": 33657448, "step": 58340 }, { "epoch": 8.690050640452785, "grad_norm": 0.3173281252384186, "learning_rate": 2.568919515935525e-06, "loss": 0.8069, "num_input_tokens_seen": 33660936, "step": 58345 }, { "epoch": 8.690795352993744, "grad_norm": 0.22687479853630066, "learning_rate": 2.5660507965997282e-06, "loss": 0.7981, "num_input_tokens_seen": 33663880, "step": 58350 }, { "epoch": 8.691540065534703, "grad_norm": 0.22020208835601807, "learning_rate": 2.5631835932590027e-06, "loss": 0.7809, "num_input_tokens_seen": 33666696, "step": 58355 }, { "epoch": 8.692284778075663, "grad_norm": 0.3191041350364685, "learning_rate": 2.5603179061071097e-06, "loss": 0.7978, "num_input_tokens_seen": 33669864, "step": 58360 }, { "epoch": 8.693029490616622, "grad_norm": 0.21130317449569702, "learning_rate": 2.5574537353376977e-06, "loss": 0.8128, "num_input_tokens_seen": 33672872, "step": 58365 }, { "epoch": 8.69377420315758, "grad_norm": 0.24205341935157776, "learning_rate": 2.5545910811443224e-06, "loss": 0.7907, "num_input_tokens_seen": 33675720, "step": 58370 }, { "epoch": 8.69451891569854, "grad_norm": 0.24001961946487427, "learning_rate": 2.5517299437204214e-06, "loss": 0.8112, "num_input_tokens_seen": 33678536, "step": 58375 }, { "epoch": 8.6952636282395, "grad_norm": 0.25222301483154297, "learning_rate": 2.5488703232593474e-06, "loss": 0.8091, "num_input_tokens_seen": 33681224, "step": 58380 }, { "epoch": 8.696008340780459, "grad_norm": 0.19635720551013947, "learning_rate": 2.5460122199543328e-06, "loss": 0.7844, "num_input_tokens_seen": 33683912, "step": 58385 }, { "epoch": 8.696753053321418, "grad_norm": 0.2230585366487503, "learning_rate": 2.54315563399852e-06, "loss": 0.834, "num_input_tokens_seen": 33687048, "step": 58390 }, { "epoch": 8.697497765862376, "grad_norm": 0.3226752281188965, "learning_rate": 2.5403005655849464e-06, "loss": 0.7847, "num_input_tokens_seen": 33690216, "step": 58395 }, { "epoch": 8.698242478403337, "grad_norm": 0.29210489988327026, "learning_rate": 2.5374470149065465e-06, "loss": 0.7851, "num_input_tokens_seen": 33693160, "step": 58400 }, { "epoch": 8.698987190944296, "grad_norm": 0.25307565927505493, "learning_rate": 2.5345949821561523e-06, "loss": 0.8007, "num_input_tokens_seen": 33696008, "step": 58405 }, { "epoch": 8.699731903485254, "grad_norm": 0.19180214405059814, "learning_rate": 2.5317444675264978e-06, "loss": 0.7817, "num_input_tokens_seen": 33699048, "step": 58410 }, { "epoch": 8.700476616026213, "grad_norm": 0.3082956373691559, "learning_rate": 2.528895471210199e-06, "loss": 0.8093, "num_input_tokens_seen": 33701832, "step": 58415 }, { "epoch": 8.701221328567174, "grad_norm": 0.2030867338180542, "learning_rate": 2.5260479933997826e-06, "loss": 0.8094, "num_input_tokens_seen": 33704552, "step": 58420 }, { "epoch": 8.701966041108133, "grad_norm": 0.23713752627372742, "learning_rate": 2.5232020342876666e-06, "loss": 0.7588, "num_input_tokens_seen": 33707176, "step": 58425 }, { "epoch": 8.702710753649091, "grad_norm": 0.20844747126102448, "learning_rate": 2.520357594066175e-06, "loss": 0.8221, "num_input_tokens_seen": 33710184, "step": 58430 }, { "epoch": 8.70345546619005, "grad_norm": 0.2114044576883316, "learning_rate": 2.5175146729275205e-06, "loss": 0.807, "num_input_tokens_seen": 33713128, "step": 58435 }, { "epoch": 8.70420017873101, "grad_norm": 0.24138902127742767, "learning_rate": 2.5146732710638192e-06, "loss": 0.8204, "num_input_tokens_seen": 33715720, "step": 58440 }, { "epoch": 8.70494489127197, "grad_norm": 0.234625905752182, "learning_rate": 2.511833388667084e-06, "loss": 0.7927, "num_input_tokens_seen": 33718440, "step": 58445 }, { "epoch": 8.705689603812928, "grad_norm": 0.19529367983341217, "learning_rate": 2.5089950259292173e-06, "loss": 0.7937, "num_input_tokens_seen": 33720968, "step": 58450 }, { "epoch": 8.706434316353887, "grad_norm": 0.2635915279388428, "learning_rate": 2.5061581830420207e-06, "loss": 0.8114, "num_input_tokens_seen": 33723752, "step": 58455 }, { "epoch": 8.707179028894847, "grad_norm": 0.17526639997959137, "learning_rate": 2.503322860197199e-06, "loss": 0.8023, "num_input_tokens_seen": 33726600, "step": 58460 }, { "epoch": 8.707923741435806, "grad_norm": 0.26331862807273865, "learning_rate": 2.5004890575863556e-06, "loss": 0.7739, "num_input_tokens_seen": 33729320, "step": 58465 }, { "epoch": 8.708668453976765, "grad_norm": 0.27843964099884033, "learning_rate": 2.497656775400986e-06, "loss": 0.7926, "num_input_tokens_seen": 33732488, "step": 58470 }, { "epoch": 8.709413166517724, "grad_norm": 0.29205551743507385, "learning_rate": 2.4948260138324827e-06, "loss": 0.7971, "num_input_tokens_seen": 33735432, "step": 58475 }, { "epoch": 8.710157879058684, "grad_norm": 0.20942138135433197, "learning_rate": 2.4919967730721414e-06, "loss": 0.8234, "num_input_tokens_seen": 33738216, "step": 58480 }, { "epoch": 8.710902591599643, "grad_norm": 0.20199181139469147, "learning_rate": 2.489169053311144e-06, "loss": 0.7873, "num_input_tokens_seen": 33741192, "step": 58485 }, { "epoch": 8.711647304140602, "grad_norm": 0.2045433670282364, "learning_rate": 2.486342854740584e-06, "loss": 0.7991, "num_input_tokens_seen": 33744040, "step": 58490 }, { "epoch": 8.71239201668156, "grad_norm": 0.17898453772068024, "learning_rate": 2.483518177551436e-06, "loss": 0.7722, "num_input_tokens_seen": 33746760, "step": 58495 }, { "epoch": 8.71313672922252, "grad_norm": 0.271049827337265, "learning_rate": 2.4806950219345842e-06, "loss": 0.8015, "num_input_tokens_seen": 33749864, "step": 58500 }, { "epoch": 8.71388144176348, "grad_norm": 0.2327551245689392, "learning_rate": 2.4778733880808036e-06, "loss": 0.7746, "num_input_tokens_seen": 33753032, "step": 58505 }, { "epoch": 8.714626154304439, "grad_norm": 0.3027409315109253, "learning_rate": 2.4750532761807748e-06, "loss": 0.774, "num_input_tokens_seen": 33756616, "step": 58510 }, { "epoch": 8.715370866845397, "grad_norm": 0.22931818664073944, "learning_rate": 2.472234686425068e-06, "loss": 0.7946, "num_input_tokens_seen": 33759368, "step": 58515 }, { "epoch": 8.716115579386356, "grad_norm": 0.2514652609825134, "learning_rate": 2.469417619004144e-06, "loss": 0.8073, "num_input_tokens_seen": 33762408, "step": 58520 }, { "epoch": 8.716860291927317, "grad_norm": 0.330524206161499, "learning_rate": 2.466602074108379e-06, "loss": 0.8039, "num_input_tokens_seen": 33765384, "step": 58525 }, { "epoch": 8.717605004468275, "grad_norm": 0.2559313178062439, "learning_rate": 2.4637880519280317e-06, "loss": 0.7943, "num_input_tokens_seen": 33768392, "step": 58530 }, { "epoch": 8.718349717009234, "grad_norm": 0.2080271691083908, "learning_rate": 2.4609755526532607e-06, "loss": 0.7856, "num_input_tokens_seen": 33771368, "step": 58535 }, { "epoch": 8.719094429550193, "grad_norm": 0.25755953788757324, "learning_rate": 2.4581645764741227e-06, "loss": 0.7874, "num_input_tokens_seen": 33774024, "step": 58540 }, { "epoch": 8.719839142091153, "grad_norm": 0.23117120563983917, "learning_rate": 2.455355123580583e-06, "loss": 0.7985, "num_input_tokens_seen": 33776904, "step": 58545 }, { "epoch": 8.720583854632112, "grad_norm": 0.15898320078849792, "learning_rate": 2.4525471941624746e-06, "loss": 0.7716, "num_input_tokens_seen": 33779976, "step": 58550 }, { "epoch": 8.721328567173071, "grad_norm": 0.18038073182106018, "learning_rate": 2.4497407884095575e-06, "loss": 0.7845, "num_input_tokens_seen": 33782856, "step": 58555 }, { "epoch": 8.72207327971403, "grad_norm": 0.1988763064146042, "learning_rate": 2.4469359065114743e-06, "loss": 0.7865, "num_input_tokens_seen": 33785640, "step": 58560 }, { "epoch": 8.72281799225499, "grad_norm": 0.28598424792289734, "learning_rate": 2.444132548657771e-06, "loss": 0.7991, "num_input_tokens_seen": 33788648, "step": 58565 }, { "epoch": 8.723562704795949, "grad_norm": 0.24189640581607819, "learning_rate": 2.4413307150378873e-06, "loss": 0.7755, "num_input_tokens_seen": 33791848, "step": 58570 }, { "epoch": 8.724307417336908, "grad_norm": 0.23373711109161377, "learning_rate": 2.4385304058411525e-06, "loss": 0.8288, "num_input_tokens_seen": 33794216, "step": 58575 }, { "epoch": 8.725052129877866, "grad_norm": 0.22748172283172607, "learning_rate": 2.4357316212568094e-06, "loss": 0.7797, "num_input_tokens_seen": 33797160, "step": 58580 }, { "epoch": 8.725796842418827, "grad_norm": 0.2516602873802185, "learning_rate": 2.432934361473979e-06, "loss": 0.8014, "num_input_tokens_seen": 33800168, "step": 58585 }, { "epoch": 8.726541554959786, "grad_norm": 0.24723778665065765, "learning_rate": 2.4301386266816938e-06, "loss": 0.7924, "num_input_tokens_seen": 33803048, "step": 58590 }, { "epoch": 8.727286267500745, "grad_norm": 0.17897392809391022, "learning_rate": 2.4273444170688774e-06, "loss": 0.7839, "num_input_tokens_seen": 33806184, "step": 58595 }, { "epoch": 8.728030980041703, "grad_norm": 0.1841212958097458, "learning_rate": 2.424551732824354e-06, "loss": 0.8128, "num_input_tokens_seen": 33808808, "step": 58600 }, { "epoch": 8.728775692582664, "grad_norm": 0.27245157957077026, "learning_rate": 2.421760574136836e-06, "loss": 0.8228, "num_input_tokens_seen": 33811752, "step": 58605 }, { "epoch": 8.729520405123623, "grad_norm": 0.20783473551273346, "learning_rate": 2.418970941194948e-06, "loss": 0.8034, "num_input_tokens_seen": 33814440, "step": 58610 }, { "epoch": 8.730265117664581, "grad_norm": 0.22208133339881897, "learning_rate": 2.4161828341871973e-06, "loss": 0.7991, "num_input_tokens_seen": 33817224, "step": 58615 }, { "epoch": 8.73100983020554, "grad_norm": 0.2191048115491867, "learning_rate": 2.4133962533019832e-06, "loss": 0.7765, "num_input_tokens_seen": 33820232, "step": 58620 }, { "epoch": 8.7317545427465, "grad_norm": 0.23890800774097443, "learning_rate": 2.410611198727622e-06, "loss": 0.794, "num_input_tokens_seen": 33822952, "step": 58625 }, { "epoch": 8.73249925528746, "grad_norm": 0.2406991720199585, "learning_rate": 2.4078276706523156e-06, "loss": 0.8402, "num_input_tokens_seen": 33825672, "step": 58630 }, { "epoch": 8.733243967828418, "grad_norm": 0.16697034239768982, "learning_rate": 2.405045669264161e-06, "loss": 0.8125, "num_input_tokens_seen": 33828296, "step": 58635 }, { "epoch": 8.733988680369377, "grad_norm": 0.19997981190681458, "learning_rate": 2.4022651947511548e-06, "loss": 0.7996, "num_input_tokens_seen": 33831080, "step": 58640 }, { "epoch": 8.734733392910336, "grad_norm": 0.18712039291858673, "learning_rate": 2.399486247301197e-06, "loss": 0.8231, "num_input_tokens_seen": 33833960, "step": 58645 }, { "epoch": 8.735478105451296, "grad_norm": 0.1820455938577652, "learning_rate": 2.3967088271020707e-06, "loss": 0.7948, "num_input_tokens_seen": 33836552, "step": 58650 }, { "epoch": 8.736222817992255, "grad_norm": 0.26562488079071045, "learning_rate": 2.3939329343414584e-06, "loss": 0.7961, "num_input_tokens_seen": 33839496, "step": 58655 }, { "epoch": 8.736967530533214, "grad_norm": 0.2088170200586319, "learning_rate": 2.39115856920695e-06, "loss": 0.7971, "num_input_tokens_seen": 33842440, "step": 58660 }, { "epoch": 8.737712243074174, "grad_norm": 0.210292786359787, "learning_rate": 2.388385731886025e-06, "loss": 0.8108, "num_input_tokens_seen": 33845384, "step": 58665 }, { "epoch": 8.738456955615133, "grad_norm": 0.3363810181617737, "learning_rate": 2.38561442256606e-06, "loss": 0.7868, "num_input_tokens_seen": 33848328, "step": 58670 }, { "epoch": 8.739201668156092, "grad_norm": 0.279146283864975, "learning_rate": 2.3828446414343288e-06, "loss": 0.7846, "num_input_tokens_seen": 33851144, "step": 58675 }, { "epoch": 8.73994638069705, "grad_norm": 0.26507431268692017, "learning_rate": 2.380076388678007e-06, "loss": 0.8102, "num_input_tokens_seen": 33853672, "step": 58680 }, { "epoch": 8.74069109323801, "grad_norm": 0.21488656103610992, "learning_rate": 2.377309664484151e-06, "loss": 0.8047, "num_input_tokens_seen": 33856648, "step": 58685 }, { "epoch": 8.74143580577897, "grad_norm": 0.2630336582660675, "learning_rate": 2.3745444690397302e-06, "loss": 0.8146, "num_input_tokens_seen": 33859688, "step": 58690 }, { "epoch": 8.742180518319929, "grad_norm": 0.36887043714523315, "learning_rate": 2.3717808025316118e-06, "loss": 0.8172, "num_input_tokens_seen": 33862408, "step": 58695 }, { "epoch": 8.742925230860887, "grad_norm": 0.1929619461297989, "learning_rate": 2.369018665146544e-06, "loss": 0.7821, "num_input_tokens_seen": 33865000, "step": 58700 }, { "epoch": 8.743669943401846, "grad_norm": 0.21341277658939362, "learning_rate": 2.36625805707118e-06, "loss": 0.8137, "num_input_tokens_seen": 33867848, "step": 58705 }, { "epoch": 8.744414655942807, "grad_norm": 0.27035823464393616, "learning_rate": 2.363498978492082e-06, "loss": 0.7882, "num_input_tokens_seen": 33871176, "step": 58710 }, { "epoch": 8.745159368483765, "grad_norm": 0.19902919232845306, "learning_rate": 2.3607414295956835e-06, "loss": 0.8029, "num_input_tokens_seen": 33873864, "step": 58715 }, { "epoch": 8.745904081024724, "grad_norm": 0.2293211966753006, "learning_rate": 2.357985410568336e-06, "loss": 0.8224, "num_input_tokens_seen": 33876936, "step": 58720 }, { "epoch": 8.746648793565683, "grad_norm": 0.2099861055612564, "learning_rate": 2.3552309215962796e-06, "loss": 0.7967, "num_input_tokens_seen": 33879560, "step": 58725 }, { "epoch": 8.747393506106643, "grad_norm": 0.20840150117874146, "learning_rate": 2.3524779628656484e-06, "loss": 0.8084, "num_input_tokens_seen": 33882472, "step": 58730 }, { "epoch": 8.748138218647602, "grad_norm": 0.1718328446149826, "learning_rate": 2.3497265345624824e-06, "loss": 0.7929, "num_input_tokens_seen": 33885256, "step": 58735 }, { "epoch": 8.748882931188561, "grad_norm": 0.17470872402191162, "learning_rate": 2.3469766368727053e-06, "loss": 0.7946, "num_input_tokens_seen": 33888136, "step": 58740 }, { "epoch": 8.74962764372952, "grad_norm": 0.20719093084335327, "learning_rate": 2.3442282699821515e-06, "loss": 0.7652, "num_input_tokens_seen": 33891048, "step": 58745 }, { "epoch": 8.75037235627048, "grad_norm": 0.20257413387298584, "learning_rate": 2.341481434076534e-06, "loss": 0.7799, "num_input_tokens_seen": 33893864, "step": 58750 }, { "epoch": 8.751117068811439, "grad_norm": 0.21012863516807556, "learning_rate": 2.338736129341479e-06, "loss": 0.8399, "num_input_tokens_seen": 33896904, "step": 58755 }, { "epoch": 8.751861781352398, "grad_norm": 0.30818435549736023, "learning_rate": 2.335992355962502e-06, "loss": 0.8237, "num_input_tokens_seen": 33900200, "step": 58760 }, { "epoch": 8.752606493893357, "grad_norm": 0.23720437288284302, "learning_rate": 2.3332501141250156e-06, "loss": 0.7949, "num_input_tokens_seen": 33902952, "step": 58765 }, { "epoch": 8.753351206434317, "grad_norm": 0.21507889032363892, "learning_rate": 2.3305094040143303e-06, "loss": 0.8074, "num_input_tokens_seen": 33905832, "step": 58770 }, { "epoch": 8.754095918975276, "grad_norm": 0.2739075720310211, "learning_rate": 2.3277702258156566e-06, "loss": 0.7938, "num_input_tokens_seen": 33908872, "step": 58775 }, { "epoch": 8.754840631516235, "grad_norm": 0.20356592535972595, "learning_rate": 2.3250325797140952e-06, "loss": 0.8017, "num_input_tokens_seen": 33911624, "step": 58780 }, { "epoch": 8.755585344057193, "grad_norm": 0.22026747465133667, "learning_rate": 2.3222964658946357e-06, "loss": 0.8014, "num_input_tokens_seen": 33914728, "step": 58785 }, { "epoch": 8.756330056598154, "grad_norm": 0.21329019963741302, "learning_rate": 2.319561884542179e-06, "loss": 0.797, "num_input_tokens_seen": 33917544, "step": 58790 }, { "epoch": 8.757074769139113, "grad_norm": 0.19205015897750854, "learning_rate": 2.3168288358415197e-06, "loss": 0.7997, "num_input_tokens_seen": 33920584, "step": 58795 }, { "epoch": 8.757819481680071, "grad_norm": 0.24330563843250275, "learning_rate": 2.314097319977343e-06, "loss": 0.7972, "num_input_tokens_seen": 33923336, "step": 58800 }, { "epoch": 8.75856419422103, "grad_norm": 0.29805171489715576, "learning_rate": 2.3113673371342378e-06, "loss": 0.8106, "num_input_tokens_seen": 33926824, "step": 58805 }, { "epoch": 8.75930890676199, "grad_norm": 0.3563644587993622, "learning_rate": 2.3086388874966865e-06, "loss": 0.8204, "num_input_tokens_seen": 33929896, "step": 58810 }, { "epoch": 8.76005361930295, "grad_norm": 0.22051671147346497, "learning_rate": 2.3059119712490613e-06, "loss": 0.7684, "num_input_tokens_seen": 33932968, "step": 58815 }, { "epoch": 8.760798331843908, "grad_norm": 0.1936446726322174, "learning_rate": 2.303186588575634e-06, "loss": 0.7912, "num_input_tokens_seen": 33935368, "step": 58820 }, { "epoch": 8.761543044384867, "grad_norm": 0.3324211537837982, "learning_rate": 2.3004627396605776e-06, "loss": 0.8373, "num_input_tokens_seen": 33938120, "step": 58825 }, { "epoch": 8.762287756925826, "grad_norm": 0.21246980130672455, "learning_rate": 2.2977404246879607e-06, "loss": 0.7869, "num_input_tokens_seen": 33940936, "step": 58830 }, { "epoch": 8.763032469466786, "grad_norm": 0.24078327417373657, "learning_rate": 2.2950196438417448e-06, "loss": 0.7889, "num_input_tokens_seen": 33944072, "step": 58835 }, { "epoch": 8.763777182007745, "grad_norm": 0.26798245310783386, "learning_rate": 2.292300397305791e-06, "loss": 0.801, "num_input_tokens_seen": 33946824, "step": 58840 }, { "epoch": 8.764521894548704, "grad_norm": 0.17910437285900116, "learning_rate": 2.289582685263858e-06, "loss": 0.7946, "num_input_tokens_seen": 33949800, "step": 58845 }, { "epoch": 8.765266607089664, "grad_norm": 0.2508542835712433, "learning_rate": 2.2868665078995878e-06, "loss": 0.8072, "num_input_tokens_seen": 33952648, "step": 58850 }, { "epoch": 8.766011319630623, "grad_norm": 0.21626637876033783, "learning_rate": 2.2841518653965388e-06, "loss": 0.7969, "num_input_tokens_seen": 33955560, "step": 58855 }, { "epoch": 8.766756032171582, "grad_norm": 0.19782525300979614, "learning_rate": 2.281438757938145e-06, "loss": 0.8046, "num_input_tokens_seen": 33958280, "step": 58860 }, { "epoch": 8.76750074471254, "grad_norm": 0.18377287685871124, "learning_rate": 2.2787271857077546e-06, "loss": 0.7667, "num_input_tokens_seen": 33961064, "step": 58865 }, { "epoch": 8.7682454572535, "grad_norm": 0.16510117053985596, "learning_rate": 2.276017148888604e-06, "loss": 0.7831, "num_input_tokens_seen": 33963912, "step": 58870 }, { "epoch": 8.76899016979446, "grad_norm": 0.30901965498924255, "learning_rate": 2.273308647663827e-06, "loss": 0.8038, "num_input_tokens_seen": 33966632, "step": 58875 }, { "epoch": 8.769734882335419, "grad_norm": 0.13426142930984497, "learning_rate": 2.27060168221645e-06, "loss": 0.7944, "num_input_tokens_seen": 33969416, "step": 58880 }, { "epoch": 8.770479594876377, "grad_norm": 0.19095806777477264, "learning_rate": 2.2678962527293986e-06, "loss": 0.8012, "num_input_tokens_seen": 33971944, "step": 58885 }, { "epoch": 8.771224307417336, "grad_norm": 0.24862438440322876, "learning_rate": 2.2651923593854985e-06, "loss": 0.7855, "num_input_tokens_seen": 33974504, "step": 58890 }, { "epoch": 8.771969019958297, "grad_norm": 0.2089969366788864, "learning_rate": 2.2624900023674678e-06, "loss": 0.7977, "num_input_tokens_seen": 33977416, "step": 58895 }, { "epoch": 8.772713732499255, "grad_norm": 0.25799673795700073, "learning_rate": 2.259789181857916e-06, "loss": 0.7979, "num_input_tokens_seen": 33980200, "step": 58900 }, { "epoch": 8.773458445040214, "grad_norm": 0.22681143879890442, "learning_rate": 2.2570898980393552e-06, "loss": 0.8055, "num_input_tokens_seen": 33982920, "step": 58905 }, { "epoch": 8.774203157581173, "grad_norm": 0.2094132900238037, "learning_rate": 2.254392151094198e-06, "loss": 0.7848, "num_input_tokens_seen": 33985800, "step": 58910 }, { "epoch": 8.774947870122134, "grad_norm": 0.21231473982334137, "learning_rate": 2.251695941204737e-06, "loss": 0.8339, "num_input_tokens_seen": 33988968, "step": 58915 }, { "epoch": 8.775692582663092, "grad_norm": 0.26617735624313354, "learning_rate": 2.2490012685531777e-06, "loss": 0.7935, "num_input_tokens_seen": 33992008, "step": 58920 }, { "epoch": 8.776437295204051, "grad_norm": 0.22708240151405334, "learning_rate": 2.246308133321612e-06, "loss": 0.8016, "num_input_tokens_seen": 33994856, "step": 58925 }, { "epoch": 8.77718200774501, "grad_norm": 0.22255179286003113, "learning_rate": 2.2436165356920335e-06, "loss": 0.8058, "num_input_tokens_seen": 33997736, "step": 58930 }, { "epoch": 8.77792672028597, "grad_norm": 0.30382606387138367, "learning_rate": 2.2409264758463363e-06, "loss": 0.7753, "num_input_tokens_seen": 34000648, "step": 58935 }, { "epoch": 8.778671432826929, "grad_norm": 0.20266805589199066, "learning_rate": 2.238237953966288e-06, "loss": 0.7906, "num_input_tokens_seen": 34003656, "step": 58940 }, { "epoch": 8.779416145367888, "grad_norm": 0.2468997985124588, "learning_rate": 2.2355509702335825e-06, "loss": 0.7891, "num_input_tokens_seen": 34006344, "step": 58945 }, { "epoch": 8.780160857908847, "grad_norm": 0.2342306226491928, "learning_rate": 2.2328655248297833e-06, "loss": 0.7855, "num_input_tokens_seen": 34009096, "step": 58950 }, { "epoch": 8.780905570449807, "grad_norm": 0.21091297268867493, "learning_rate": 2.2301816179363695e-06, "loss": 0.7776, "num_input_tokens_seen": 34012232, "step": 58955 }, { "epoch": 8.781650282990766, "grad_norm": 0.17451393604278564, "learning_rate": 2.2274992497347045e-06, "loss": 0.7997, "num_input_tokens_seen": 34015304, "step": 58960 }, { "epoch": 8.782394995531725, "grad_norm": 0.194550558924675, "learning_rate": 2.224818420406055e-06, "loss": 0.7733, "num_input_tokens_seen": 34018472, "step": 58965 }, { "epoch": 8.783139708072683, "grad_norm": 0.21627205610275269, "learning_rate": 2.2221391301315787e-06, "loss": 0.791, "num_input_tokens_seen": 34021224, "step": 58970 }, { "epoch": 8.783884420613644, "grad_norm": 0.25523605942726135, "learning_rate": 2.2194613790923387e-06, "loss": 0.804, "num_input_tokens_seen": 34024136, "step": 58975 }, { "epoch": 8.784629133154603, "grad_norm": 0.21932609379291534, "learning_rate": 2.2167851674692763e-06, "loss": 0.7775, "num_input_tokens_seen": 34026920, "step": 58980 }, { "epoch": 8.785373845695561, "grad_norm": 0.22139517962932587, "learning_rate": 2.214110495443242e-06, "loss": 0.7943, "num_input_tokens_seen": 34029896, "step": 58985 }, { "epoch": 8.78611855823652, "grad_norm": 0.23158837854862213, "learning_rate": 2.211437363194976e-06, "loss": 0.8155, "num_input_tokens_seen": 34032584, "step": 58990 }, { "epoch": 8.78686327077748, "grad_norm": 0.32867565751075745, "learning_rate": 2.2087657709051246e-06, "loss": 0.8111, "num_input_tokens_seen": 34035656, "step": 58995 }, { "epoch": 8.78760798331844, "grad_norm": 0.22682562470436096, "learning_rate": 2.206095718754217e-06, "loss": 0.7631, "num_input_tokens_seen": 34038536, "step": 59000 }, { "epoch": 8.788352695859398, "grad_norm": 0.3279826045036316, "learning_rate": 2.2034272069226897e-06, "loss": 0.8041, "num_input_tokens_seen": 34041640, "step": 59005 }, { "epoch": 8.789097408400357, "grad_norm": 0.20567253232002258, "learning_rate": 2.2007602355908707e-06, "loss": 0.8299, "num_input_tokens_seen": 34044488, "step": 59010 }, { "epoch": 8.789842120941316, "grad_norm": 0.20431166887283325, "learning_rate": 2.19809480493898e-06, "loss": 0.8067, "num_input_tokens_seen": 34047176, "step": 59015 }, { "epoch": 8.790586833482276, "grad_norm": 0.20842434465885162, "learning_rate": 2.195430915147134e-06, "loss": 0.7929, "num_input_tokens_seen": 34050152, "step": 59020 }, { "epoch": 8.791331546023235, "grad_norm": 0.2769920825958252, "learning_rate": 2.192768566395348e-06, "loss": 0.7602, "num_input_tokens_seen": 34053512, "step": 59025 }, { "epoch": 8.792076258564194, "grad_norm": 0.21547380089759827, "learning_rate": 2.1901077588635357e-06, "loss": 0.8023, "num_input_tokens_seen": 34056488, "step": 59030 }, { "epoch": 8.792820971105153, "grad_norm": 0.1822604387998581, "learning_rate": 2.187448492731503e-06, "loss": 0.8018, "num_input_tokens_seen": 34059272, "step": 59035 }, { "epoch": 8.793565683646113, "grad_norm": 0.22123081982135773, "learning_rate": 2.184790768178957e-06, "loss": 0.7861, "num_input_tokens_seen": 34062760, "step": 59040 }, { "epoch": 8.794310396187072, "grad_norm": 0.16314733028411865, "learning_rate": 2.182134585385487e-06, "loss": 0.8002, "num_input_tokens_seen": 34065704, "step": 59045 }, { "epoch": 8.79505510872803, "grad_norm": 0.22635263204574585, "learning_rate": 2.179479944530588e-06, "loss": 0.7967, "num_input_tokens_seen": 34068648, "step": 59050 }, { "epoch": 8.79579982126899, "grad_norm": 0.2464129626750946, "learning_rate": 2.1768268457936613e-06, "loss": 0.8113, "num_input_tokens_seen": 34071624, "step": 59055 }, { "epoch": 8.79654453380995, "grad_norm": 0.23131710290908813, "learning_rate": 2.1741752893539775e-06, "loss": 0.8119, "num_input_tokens_seen": 34074600, "step": 59060 }, { "epoch": 8.797289246350909, "grad_norm": 0.29050251841545105, "learning_rate": 2.1715252753907234e-06, "loss": 0.7941, "num_input_tokens_seen": 34077832, "step": 59065 }, { "epoch": 8.798033958891867, "grad_norm": 0.240230530500412, "learning_rate": 2.168876804082978e-06, "loss": 0.7858, "num_input_tokens_seen": 34080456, "step": 59070 }, { "epoch": 8.798778671432826, "grad_norm": 0.17576025426387787, "learning_rate": 2.166229875609718e-06, "loss": 0.7796, "num_input_tokens_seen": 34083336, "step": 59075 }, { "epoch": 8.799523383973787, "grad_norm": 0.253322958946228, "learning_rate": 2.163584490149806e-06, "loss": 0.7897, "num_input_tokens_seen": 34086088, "step": 59080 }, { "epoch": 8.800268096514746, "grad_norm": 0.20842304825782776, "learning_rate": 2.1609406478820066e-06, "loss": 0.79, "num_input_tokens_seen": 34089224, "step": 59085 }, { "epoch": 8.801012809055704, "grad_norm": 0.2397356480360031, "learning_rate": 2.15829834898498e-06, "loss": 0.8183, "num_input_tokens_seen": 34092168, "step": 59090 }, { "epoch": 8.801757521596663, "grad_norm": 0.1970713585615158, "learning_rate": 2.155657593637289e-06, "loss": 0.8299, "num_input_tokens_seen": 34094952, "step": 59095 }, { "epoch": 8.802502234137624, "grad_norm": 0.21516866981983185, "learning_rate": 2.1530183820173743e-06, "loss": 0.7865, "num_input_tokens_seen": 34097736, "step": 59100 }, { "epoch": 8.803246946678582, "grad_norm": 0.19043342769145966, "learning_rate": 2.1503807143035875e-06, "loss": 0.7648, "num_input_tokens_seen": 34100712, "step": 59105 }, { "epoch": 8.803991659219541, "grad_norm": 0.23268842697143555, "learning_rate": 2.1477445906741776e-06, "loss": 0.7812, "num_input_tokens_seen": 34103432, "step": 59110 }, { "epoch": 8.8047363717605, "grad_norm": 0.1912938952445984, "learning_rate": 2.1451100113072748e-06, "loss": 0.8005, "num_input_tokens_seen": 34106408, "step": 59115 }, { "epoch": 8.80548108430146, "grad_norm": 0.20488537847995758, "learning_rate": 2.142476976380914e-06, "loss": 0.789, "num_input_tokens_seen": 34109032, "step": 59120 }, { "epoch": 8.80622579684242, "grad_norm": 0.2220134288072586, "learning_rate": 2.1398454860730277e-06, "loss": 0.7911, "num_input_tokens_seen": 34111944, "step": 59125 }, { "epoch": 8.806970509383378, "grad_norm": 0.24050340056419373, "learning_rate": 2.1372155405614436e-06, "loss": 0.7858, "num_input_tokens_seen": 34114760, "step": 59130 }, { "epoch": 8.807715221924337, "grad_norm": 0.19603969156742096, "learning_rate": 2.13458714002388e-06, "loss": 0.787, "num_input_tokens_seen": 34117448, "step": 59135 }, { "epoch": 8.808459934465297, "grad_norm": 0.26330843567848206, "learning_rate": 2.1319602846379518e-06, "loss": 0.8004, "num_input_tokens_seen": 34120456, "step": 59140 }, { "epoch": 8.809204647006256, "grad_norm": 0.2355555295944214, "learning_rate": 2.1293349745811765e-06, "loss": 0.8123, "num_input_tokens_seen": 34123336, "step": 59145 }, { "epoch": 8.809949359547215, "grad_norm": 0.24618001282215118, "learning_rate": 2.1267112100309545e-06, "loss": 0.8171, "num_input_tokens_seen": 34126152, "step": 59150 }, { "epoch": 8.810694072088173, "grad_norm": 0.23649157583713531, "learning_rate": 2.1240889911645913e-06, "loss": 0.8195, "num_input_tokens_seen": 34129000, "step": 59155 }, { "epoch": 8.811438784629132, "grad_norm": 0.1824686974287033, "learning_rate": 2.121468318159289e-06, "loss": 0.784, "num_input_tokens_seen": 34132008, "step": 59160 }, { "epoch": 8.812183497170093, "grad_norm": 0.24196137487888336, "learning_rate": 2.1188491911921403e-06, "loss": 0.7568, "num_input_tokens_seen": 34134728, "step": 59165 }, { "epoch": 8.812928209711052, "grad_norm": 0.21905659139156342, "learning_rate": 2.1162316104401364e-06, "loss": 0.7968, "num_input_tokens_seen": 34137512, "step": 59170 }, { "epoch": 8.81367292225201, "grad_norm": 0.19312351942062378, "learning_rate": 2.1136155760801633e-06, "loss": 0.805, "num_input_tokens_seen": 34140328, "step": 59175 }, { "epoch": 8.81441763479297, "grad_norm": 0.2339327484369278, "learning_rate": 2.1110010882890025e-06, "loss": 0.7723, "num_input_tokens_seen": 34143336, "step": 59180 }, { "epoch": 8.81516234733393, "grad_norm": 0.21813753247261047, "learning_rate": 2.1083881472433232e-06, "loss": 0.8, "num_input_tokens_seen": 34146088, "step": 59185 }, { "epoch": 8.815907059874888, "grad_norm": 0.25661739706993103, "learning_rate": 2.105776753119701e-06, "loss": 0.8247, "num_input_tokens_seen": 34148840, "step": 59190 }, { "epoch": 8.816651772415847, "grad_norm": 0.36066025495529175, "learning_rate": 2.1031669060946056e-06, "loss": 0.8225, "num_input_tokens_seen": 34151944, "step": 59195 }, { "epoch": 8.817396484956806, "grad_norm": 0.2351033091545105, "learning_rate": 2.100558606344399e-06, "loss": 0.8209, "num_input_tokens_seen": 34155112, "step": 59200 }, { "epoch": 8.818141197497766, "grad_norm": 0.2215709686279297, "learning_rate": 2.0979518540453435e-06, "loss": 0.7836, "num_input_tokens_seen": 34157800, "step": 59205 }, { "epoch": 8.818885910038725, "grad_norm": 0.2243625670671463, "learning_rate": 2.095346649373586e-06, "loss": 0.7778, "num_input_tokens_seen": 34160680, "step": 59210 }, { "epoch": 8.819630622579684, "grad_norm": 0.2135906219482422, "learning_rate": 2.092742992505181e-06, "loss": 0.7733, "num_input_tokens_seen": 34163336, "step": 59215 }, { "epoch": 8.820375335120643, "grad_norm": 0.16852815449237823, "learning_rate": 2.090140883616068e-06, "loss": 0.7638, "num_input_tokens_seen": 34166120, "step": 59220 }, { "epoch": 8.821120047661603, "grad_norm": 0.23312892019748688, "learning_rate": 2.087540322882087e-06, "loss": 0.7781, "num_input_tokens_seen": 34169000, "step": 59225 }, { "epoch": 8.821864760202562, "grad_norm": 0.18294639885425568, "learning_rate": 2.084941310478977e-06, "loss": 0.7936, "num_input_tokens_seen": 34172008, "step": 59230 }, { "epoch": 8.82260947274352, "grad_norm": 0.19439247250556946, "learning_rate": 2.0823438465823656e-06, "loss": 0.786, "num_input_tokens_seen": 34174792, "step": 59235 }, { "epoch": 8.82335418528448, "grad_norm": 0.2745455503463745, "learning_rate": 2.079747931367787e-06, "loss": 0.8085, "num_input_tokens_seen": 34177576, "step": 59240 }, { "epoch": 8.82409889782544, "grad_norm": 0.24251490831375122, "learning_rate": 2.0771535650106533e-06, "loss": 0.8022, "num_input_tokens_seen": 34180488, "step": 59245 }, { "epoch": 8.824843610366399, "grad_norm": 0.3034747838973999, "learning_rate": 2.0745607476862826e-06, "loss": 0.8075, "num_input_tokens_seen": 34183176, "step": 59250 }, { "epoch": 8.825588322907358, "grad_norm": 0.22701971232891083, "learning_rate": 2.0719694795698907e-06, "loss": 0.7901, "num_input_tokens_seen": 34186024, "step": 59255 }, { "epoch": 8.826333035448316, "grad_norm": 0.20981764793395996, "learning_rate": 2.0693797608365817e-06, "loss": 0.7918, "num_input_tokens_seen": 34188904, "step": 59260 }, { "epoch": 8.827077747989277, "grad_norm": 0.17332342267036438, "learning_rate": 2.0667915916613573e-06, "loss": 0.8235, "num_input_tokens_seen": 34191464, "step": 59265 }, { "epoch": 8.827822460530236, "grad_norm": 0.2557583451271057, "learning_rate": 2.0642049722191193e-06, "loss": 0.8092, "num_input_tokens_seen": 34194184, "step": 59270 }, { "epoch": 8.828567173071194, "grad_norm": 0.21383380889892578, "learning_rate": 2.0616199026846613e-06, "loss": 0.7735, "num_input_tokens_seen": 34197160, "step": 59275 }, { "epoch": 8.829311885612153, "grad_norm": 0.31175726652145386, "learning_rate": 2.059036383232668e-06, "loss": 0.8184, "num_input_tokens_seen": 34200200, "step": 59280 }, { "epoch": 8.830056598153114, "grad_norm": 0.21925440430641174, "learning_rate": 2.0564544140377228e-06, "loss": 0.807, "num_input_tokens_seen": 34203368, "step": 59285 }, { "epoch": 8.830801310694072, "grad_norm": 0.1721455454826355, "learning_rate": 2.0538739952743054e-06, "loss": 0.7972, "num_input_tokens_seen": 34206024, "step": 59290 }, { "epoch": 8.831546023235031, "grad_norm": 0.2434229701757431, "learning_rate": 2.0512951271167922e-06, "loss": 0.8007, "num_input_tokens_seen": 34208744, "step": 59295 }, { "epoch": 8.83229073577599, "grad_norm": 0.15826132893562317, "learning_rate": 2.048717809739459e-06, "loss": 0.7962, "num_input_tokens_seen": 34211560, "step": 59300 }, { "epoch": 8.83303544831695, "grad_norm": 0.21840709447860718, "learning_rate": 2.046142043316457e-06, "loss": 0.7864, "num_input_tokens_seen": 34214312, "step": 59305 }, { "epoch": 8.83378016085791, "grad_norm": 0.32032716274261475, "learning_rate": 2.0435678280218556e-06, "loss": 0.7881, "num_input_tokens_seen": 34217064, "step": 59310 }, { "epoch": 8.834524873398868, "grad_norm": 0.25587889552116394, "learning_rate": 2.040995164029602e-06, "loss": 0.8135, "num_input_tokens_seen": 34219912, "step": 59315 }, { "epoch": 8.835269585939827, "grad_norm": 0.2829260230064392, "learning_rate": 2.038424051513549e-06, "loss": 0.8145, "num_input_tokens_seen": 34222728, "step": 59320 }, { "epoch": 8.836014298480787, "grad_norm": 0.2692870795726776, "learning_rate": 2.035854490647446e-06, "loss": 0.7918, "num_input_tokens_seen": 34225640, "step": 59325 }, { "epoch": 8.836759011021746, "grad_norm": 0.2688799202442169, "learning_rate": 2.033286481604932e-06, "loss": 0.8189, "num_input_tokens_seen": 34228392, "step": 59330 }, { "epoch": 8.837503723562705, "grad_norm": 0.3362376093864441, "learning_rate": 2.0307200245595403e-06, "loss": 0.7618, "num_input_tokens_seen": 34231592, "step": 59335 }, { "epoch": 8.838248436103664, "grad_norm": 0.29126983880996704, "learning_rate": 2.028155119684708e-06, "loss": 0.8166, "num_input_tokens_seen": 34234568, "step": 59340 }, { "epoch": 8.838993148644622, "grad_norm": 0.3129896819591522, "learning_rate": 2.0255917671537534e-06, "loss": 0.7661, "num_input_tokens_seen": 34237640, "step": 59345 }, { "epoch": 8.839737861185583, "grad_norm": 0.2096903920173645, "learning_rate": 2.0230299671399e-06, "loss": 0.799, "num_input_tokens_seen": 34240456, "step": 59350 }, { "epoch": 8.840482573726542, "grad_norm": 0.2935349941253662, "learning_rate": 2.0204697198162593e-06, "loss": 0.8278, "num_input_tokens_seen": 34243688, "step": 59355 }, { "epoch": 8.8412272862675, "grad_norm": 0.18214528262615204, "learning_rate": 2.0179110253558507e-06, "loss": 0.793, "num_input_tokens_seen": 34246600, "step": 59360 }, { "epoch": 8.84197199880846, "grad_norm": 0.24502204358577728, "learning_rate": 2.0153538839315756e-06, "loss": 0.7934, "num_input_tokens_seen": 34249384, "step": 59365 }, { "epoch": 8.84271671134942, "grad_norm": 0.23434516787528992, "learning_rate": 2.0127982957162395e-06, "loss": 0.8107, "num_input_tokens_seen": 34252232, "step": 59370 }, { "epoch": 8.843461423890378, "grad_norm": 0.24731026589870453, "learning_rate": 2.0102442608825324e-06, "loss": 0.7773, "num_input_tokens_seen": 34255080, "step": 59375 }, { "epoch": 8.844206136431337, "grad_norm": 0.21408389508724213, "learning_rate": 2.007691779603052e-06, "loss": 0.793, "num_input_tokens_seen": 34257960, "step": 59380 }, { "epoch": 8.844950848972296, "grad_norm": 0.20974019169807434, "learning_rate": 2.0051408520502774e-06, "loss": 0.814, "num_input_tokens_seen": 34260904, "step": 59385 }, { "epoch": 8.845695561513256, "grad_norm": 0.24481962621212006, "learning_rate": 2.0025914783965926e-06, "loss": 0.8025, "num_input_tokens_seen": 34263560, "step": 59390 }, { "epoch": 8.846440274054215, "grad_norm": 0.21390320360660553, "learning_rate": 2.000043658814277e-06, "loss": 0.8105, "num_input_tokens_seen": 34266536, "step": 59395 }, { "epoch": 8.847184986595174, "grad_norm": 0.19372938573360443, "learning_rate": 1.9974973934755003e-06, "loss": 0.7941, "num_input_tokens_seen": 34269160, "step": 59400 }, { "epoch": 8.847929699136133, "grad_norm": 0.37719178199768066, "learning_rate": 1.994952682552331e-06, "loss": 0.8193, "num_input_tokens_seen": 34272168, "step": 59405 }, { "epoch": 8.848674411677093, "grad_norm": 0.20568890869617462, "learning_rate": 1.9924095262167238e-06, "loss": 0.7684, "num_input_tokens_seen": 34274984, "step": 59410 }, { "epoch": 8.849419124218052, "grad_norm": 0.24205997586250305, "learning_rate": 1.9898679246405372e-06, "loss": 0.8389, "num_input_tokens_seen": 34277896, "step": 59415 }, { "epoch": 8.85016383675901, "grad_norm": 0.18346387147903442, "learning_rate": 1.9873278779955316e-06, "loss": 0.7935, "num_input_tokens_seen": 34280872, "step": 59420 }, { "epoch": 8.85090854929997, "grad_norm": 0.24874509871006012, "learning_rate": 1.9847893864533395e-06, "loss": 0.7801, "num_input_tokens_seen": 34283624, "step": 59425 }, { "epoch": 8.85165326184093, "grad_norm": 0.2050900012254715, "learning_rate": 1.9822524501855067e-06, "loss": 0.8073, "num_input_tokens_seen": 34286344, "step": 59430 }, { "epoch": 8.852397974381889, "grad_norm": 0.267008900642395, "learning_rate": 1.979717069363471e-06, "loss": 0.8081, "num_input_tokens_seen": 34289160, "step": 59435 }, { "epoch": 8.853142686922848, "grad_norm": 0.3070235848426819, "learning_rate": 1.9771832441585647e-06, "loss": 0.764, "num_input_tokens_seen": 34292104, "step": 59440 }, { "epoch": 8.853887399463806, "grad_norm": 0.217317596077919, "learning_rate": 1.9746509747420065e-06, "loss": 0.8066, "num_input_tokens_seen": 34294760, "step": 59445 }, { "epoch": 8.854632112004767, "grad_norm": 0.2518477737903595, "learning_rate": 1.972120261284924e-06, "loss": 0.8437, "num_input_tokens_seen": 34297736, "step": 59450 }, { "epoch": 8.855376824545726, "grad_norm": 0.20216797292232513, "learning_rate": 1.9695911039583265e-06, "loss": 0.7927, "num_input_tokens_seen": 34300488, "step": 59455 }, { "epoch": 8.856121537086684, "grad_norm": 0.1757963001728058, "learning_rate": 1.9670635029331336e-06, "loss": 0.8238, "num_input_tokens_seen": 34303432, "step": 59460 }, { "epoch": 8.856866249627643, "grad_norm": 0.21835291385650635, "learning_rate": 1.9645374583801417e-06, "loss": 0.7748, "num_input_tokens_seen": 34306152, "step": 59465 }, { "epoch": 8.857610962168604, "grad_norm": 0.18003281950950623, "learning_rate": 1.9620129704700506e-06, "loss": 0.8182, "num_input_tokens_seen": 34309192, "step": 59470 }, { "epoch": 8.858355674709562, "grad_norm": 0.32612550258636475, "learning_rate": 1.95949003937346e-06, "loss": 0.7837, "num_input_tokens_seen": 34311944, "step": 59475 }, { "epoch": 8.859100387250521, "grad_norm": 0.14248894155025482, "learning_rate": 1.9569686652608555e-06, "loss": 0.7902, "num_input_tokens_seen": 34314920, "step": 59480 }, { "epoch": 8.85984509979148, "grad_norm": 0.17921969294548035, "learning_rate": 1.9544488483026203e-06, "loss": 0.7741, "num_input_tokens_seen": 34317608, "step": 59485 }, { "epoch": 8.86058981233244, "grad_norm": 0.39365747570991516, "learning_rate": 1.9519305886690378e-06, "loss": 0.8057, "num_input_tokens_seen": 34320680, "step": 59490 }, { "epoch": 8.8613345248734, "grad_norm": 0.229779452085495, "learning_rate": 1.949413886530277e-06, "loss": 0.7601, "num_input_tokens_seen": 34323272, "step": 59495 }, { "epoch": 8.862079237414358, "grad_norm": 0.27867886424064636, "learning_rate": 1.9468987420564135e-06, "loss": 0.7785, "num_input_tokens_seen": 34326088, "step": 59500 }, { "epoch": 8.862823949955317, "grad_norm": 0.22802770137786865, "learning_rate": 1.9443851554174026e-06, "loss": 0.8304, "num_input_tokens_seen": 34328776, "step": 59505 }, { "epoch": 8.863568662496277, "grad_norm": 0.16889353096485138, "learning_rate": 1.9418731267831088e-06, "loss": 0.8116, "num_input_tokens_seen": 34331368, "step": 59510 }, { "epoch": 8.864313375037236, "grad_norm": 0.21937014162540436, "learning_rate": 1.939362656323279e-06, "loss": 0.8174, "num_input_tokens_seen": 34334120, "step": 59515 }, { "epoch": 8.865058087578195, "grad_norm": 0.20455697178840637, "learning_rate": 1.936853744207562e-06, "loss": 0.8003, "num_input_tokens_seen": 34336872, "step": 59520 }, { "epoch": 8.865802800119154, "grad_norm": 0.29872676730155945, "learning_rate": 1.9343463906055017e-06, "loss": 0.793, "num_input_tokens_seen": 34340200, "step": 59525 }, { "epoch": 8.866547512660112, "grad_norm": 0.2049122005701065, "learning_rate": 1.931840595686535e-06, "loss": 0.7882, "num_input_tokens_seen": 34342920, "step": 59530 }, { "epoch": 8.867292225201073, "grad_norm": 0.24744610488414764, "learning_rate": 1.929336359619996e-06, "loss": 0.7992, "num_input_tokens_seen": 34345704, "step": 59535 }, { "epoch": 8.868036937742032, "grad_norm": 0.2431274950504303, "learning_rate": 1.9268336825751022e-06, "loss": 0.8174, "num_input_tokens_seen": 34348712, "step": 59540 }, { "epoch": 8.86878165028299, "grad_norm": 0.21202296018600464, "learning_rate": 1.9243325647209846e-06, "loss": 0.7932, "num_input_tokens_seen": 34351528, "step": 59545 }, { "epoch": 8.86952636282395, "grad_norm": 0.23014633357524872, "learning_rate": 1.9218330062266474e-06, "loss": 0.8106, "num_input_tokens_seen": 34354248, "step": 59550 }, { "epoch": 8.87027107536491, "grad_norm": 0.20183388888835907, "learning_rate": 1.919335007261008e-06, "loss": 0.8, "num_input_tokens_seen": 34356872, "step": 59555 }, { "epoch": 8.871015787905868, "grad_norm": 0.2808038890361786, "learning_rate": 1.9168385679928707e-06, "loss": 0.7915, "num_input_tokens_seen": 34359496, "step": 59560 }, { "epoch": 8.871760500446827, "grad_norm": 0.246503084897995, "learning_rate": 1.914343688590933e-06, "loss": 0.7987, "num_input_tokens_seen": 34362408, "step": 59565 }, { "epoch": 8.872505212987786, "grad_norm": 0.19405686855316162, "learning_rate": 1.9118503692237917e-06, "loss": 0.8014, "num_input_tokens_seen": 34364968, "step": 59570 }, { "epoch": 8.873249925528746, "grad_norm": 0.19820745289325714, "learning_rate": 1.9093586100599304e-06, "loss": 0.7998, "num_input_tokens_seen": 34367752, "step": 59575 }, { "epoch": 8.873994638069705, "grad_norm": 0.2401808500289917, "learning_rate": 1.90686841126774e-06, "loss": 0.8003, "num_input_tokens_seen": 34370632, "step": 59580 }, { "epoch": 8.874739350610664, "grad_norm": 0.13847722113132477, "learning_rate": 1.9043797730154856e-06, "loss": 0.8022, "num_input_tokens_seen": 34373224, "step": 59585 }, { "epoch": 8.875484063151623, "grad_norm": 0.1968056857585907, "learning_rate": 1.9018926954713495e-06, "loss": 0.778, "num_input_tokens_seen": 34376008, "step": 59590 }, { "epoch": 8.876228775692583, "grad_norm": 0.26755911111831665, "learning_rate": 1.8994071788033919e-06, "loss": 0.8004, "num_input_tokens_seen": 34379048, "step": 59595 }, { "epoch": 8.876973488233542, "grad_norm": 0.24434058368206024, "learning_rate": 1.896923223179578e-06, "loss": 0.8078, "num_input_tokens_seen": 34382088, "step": 59600 }, { "epoch": 8.8777182007745, "grad_norm": 0.13660340011119843, "learning_rate": 1.8944408287677683e-06, "loss": 0.7947, "num_input_tokens_seen": 34384840, "step": 59605 }, { "epoch": 8.87846291331546, "grad_norm": 0.21968872845172882, "learning_rate": 1.891959995735701e-06, "loss": 0.8137, "num_input_tokens_seen": 34387848, "step": 59610 }, { "epoch": 8.87920762585642, "grad_norm": 0.215078204870224, "learning_rate": 1.8894807242510248e-06, "loss": 0.8101, "num_input_tokens_seen": 34390888, "step": 59615 }, { "epoch": 8.879952338397379, "grad_norm": 0.25668448209762573, "learning_rate": 1.8870030144812894e-06, "loss": 0.8009, "num_input_tokens_seen": 34394120, "step": 59620 }, { "epoch": 8.880697050938338, "grad_norm": 0.2529030740261078, "learning_rate": 1.8845268665939109e-06, "loss": 0.8231, "num_input_tokens_seen": 34396936, "step": 59625 }, { "epoch": 8.881441763479296, "grad_norm": 0.21368783712387085, "learning_rate": 1.8820522807562302e-06, "loss": 0.7629, "num_input_tokens_seen": 34399560, "step": 59630 }, { "epoch": 8.882186476020257, "grad_norm": 0.16800859570503235, "learning_rate": 1.8795792571354637e-06, "loss": 0.7713, "num_input_tokens_seen": 34402472, "step": 59635 }, { "epoch": 8.882931188561216, "grad_norm": 0.16628959774971008, "learning_rate": 1.8771077958987333e-06, "loss": 0.8084, "num_input_tokens_seen": 34405352, "step": 59640 }, { "epoch": 8.883675901102174, "grad_norm": 0.2556767761707306, "learning_rate": 1.874637897213044e-06, "loss": 0.7853, "num_input_tokens_seen": 34408168, "step": 59645 }, { "epoch": 8.884420613643133, "grad_norm": 0.2349443882703781, "learning_rate": 1.8721695612453072e-06, "loss": 0.8176, "num_input_tokens_seen": 34411016, "step": 59650 }, { "epoch": 8.885165326184094, "grad_norm": 0.21134337782859802, "learning_rate": 1.869702788162317e-06, "loss": 0.8404, "num_input_tokens_seen": 34413704, "step": 59655 }, { "epoch": 8.885910038725052, "grad_norm": 0.24669045209884644, "learning_rate": 1.8672375781307787e-06, "loss": 0.7932, "num_input_tokens_seen": 34416616, "step": 59660 }, { "epoch": 8.886654751266011, "grad_norm": 0.2328200489282608, "learning_rate": 1.864773931317268e-06, "loss": 0.7932, "num_input_tokens_seen": 34419432, "step": 59665 }, { "epoch": 8.88739946380697, "grad_norm": 0.19225069880485535, "learning_rate": 1.8623118478882733e-06, "loss": 0.8136, "num_input_tokens_seen": 34422088, "step": 59670 }, { "epoch": 8.88814417634793, "grad_norm": 0.17000854015350342, "learning_rate": 1.8598513280101786e-06, "loss": 0.7955, "num_input_tokens_seen": 34424648, "step": 59675 }, { "epoch": 8.88888888888889, "grad_norm": 0.20263370871543884, "learning_rate": 1.8573923718492454e-06, "loss": 0.7966, "num_input_tokens_seen": 34427432, "step": 59680 }, { "epoch": 8.889633601429848, "grad_norm": 0.24938617646694183, "learning_rate": 1.854934979571643e-06, "loss": 0.8002, "num_input_tokens_seen": 34430280, "step": 59685 }, { "epoch": 8.890378313970807, "grad_norm": 0.2393629550933838, "learning_rate": 1.8524791513434364e-06, "loss": 0.7987, "num_input_tokens_seen": 34433000, "step": 59690 }, { "epoch": 8.891123026511767, "grad_norm": 0.20207926630973816, "learning_rate": 1.8500248873305758e-06, "loss": 0.7988, "num_input_tokens_seen": 34435944, "step": 59695 }, { "epoch": 8.891867739052726, "grad_norm": 0.21865415573120117, "learning_rate": 1.8475721876989177e-06, "loss": 0.8133, "num_input_tokens_seen": 34438824, "step": 59700 }, { "epoch": 8.892612451593685, "grad_norm": 0.17608174681663513, "learning_rate": 1.845121052614196e-06, "loss": 0.8085, "num_input_tokens_seen": 34441640, "step": 59705 }, { "epoch": 8.893357164134644, "grad_norm": 0.17991890013217926, "learning_rate": 1.842671482242056e-06, "loss": 0.7931, "num_input_tokens_seen": 34444328, "step": 59710 }, { "epoch": 8.894101876675602, "grad_norm": 0.3028895854949951, "learning_rate": 1.8402234767480237e-06, "loss": 0.8226, "num_input_tokens_seen": 34447272, "step": 59715 }, { "epoch": 8.894846589216563, "grad_norm": 0.20466461777687073, "learning_rate": 1.8377770362975277e-06, "loss": 0.8242, "num_input_tokens_seen": 34449832, "step": 59720 }, { "epoch": 8.895591301757522, "grad_norm": 0.16485078632831573, "learning_rate": 1.835332161055886e-06, "loss": 0.7737, "num_input_tokens_seen": 34452872, "step": 59725 }, { "epoch": 8.89633601429848, "grad_norm": 0.19976934790611267, "learning_rate": 1.832888851188319e-06, "loss": 0.7873, "num_input_tokens_seen": 34455624, "step": 59730 }, { "epoch": 8.89708072683944, "grad_norm": 0.22375747561454773, "learning_rate": 1.8304471068599365e-06, "loss": 0.8198, "num_input_tokens_seen": 34458568, "step": 59735 }, { "epoch": 8.8978254393804, "grad_norm": 0.19641521573066711, "learning_rate": 1.8280069282357342e-06, "loss": 0.803, "num_input_tokens_seen": 34461352, "step": 59740 }, { "epoch": 8.898570151921358, "grad_norm": 0.2855178713798523, "learning_rate": 1.8255683154806163e-06, "loss": 0.799, "num_input_tokens_seen": 34464232, "step": 59745 }, { "epoch": 8.899314864462317, "grad_norm": 0.22071020305156708, "learning_rate": 1.8231312687593677e-06, "loss": 0.842, "num_input_tokens_seen": 34467016, "step": 59750 }, { "epoch": 8.900059577003276, "grad_norm": 0.23947609961032867, "learning_rate": 1.8206957882366788e-06, "loss": 0.788, "num_input_tokens_seen": 34469608, "step": 59755 }, { "epoch": 8.900804289544237, "grad_norm": 0.18917159736156464, "learning_rate": 1.818261874077129e-06, "loss": 0.8086, "num_input_tokens_seen": 34472648, "step": 59760 }, { "epoch": 8.901549002085195, "grad_norm": 0.2674722373485565, "learning_rate": 1.8158295264451897e-06, "loss": 0.8002, "num_input_tokens_seen": 34475752, "step": 59765 }, { "epoch": 8.902293714626154, "grad_norm": 0.1891024112701416, "learning_rate": 1.813398745505235e-06, "loss": 0.798, "num_input_tokens_seen": 34478792, "step": 59770 }, { "epoch": 8.903038427167113, "grad_norm": 0.25645712018013, "learning_rate": 1.8109695314215192e-06, "loss": 0.7693, "num_input_tokens_seen": 34481800, "step": 59775 }, { "epoch": 8.903783139708073, "grad_norm": 0.27415931224823, "learning_rate": 1.8085418843582086e-06, "loss": 0.8068, "num_input_tokens_seen": 34484840, "step": 59780 }, { "epoch": 8.904527852249032, "grad_norm": 0.20171724259853363, "learning_rate": 1.8061158044793413e-06, "loss": 0.8003, "num_input_tokens_seen": 34487720, "step": 59785 }, { "epoch": 8.90527256478999, "grad_norm": 0.20951540768146515, "learning_rate": 1.8036912919488697e-06, "loss": 0.8368, "num_input_tokens_seen": 34490760, "step": 59790 }, { "epoch": 8.90601727733095, "grad_norm": 0.21610434353351593, "learning_rate": 1.8012683469306319e-06, "loss": 0.7942, "num_input_tokens_seen": 34493608, "step": 59795 }, { "epoch": 8.90676198987191, "grad_norm": 0.33791860938072205, "learning_rate": 1.798846969588358e-06, "loss": 0.7971, "num_input_tokens_seen": 34496232, "step": 59800 }, { "epoch": 8.907506702412869, "grad_norm": 0.20229442417621613, "learning_rate": 1.7964271600856813e-06, "loss": 0.7731, "num_input_tokens_seen": 34499080, "step": 59805 }, { "epoch": 8.908251414953828, "grad_norm": 0.2568398714065552, "learning_rate": 1.7940089185861153e-06, "loss": 0.8124, "num_input_tokens_seen": 34502088, "step": 59810 }, { "epoch": 8.908996127494786, "grad_norm": 0.22017045319080353, "learning_rate": 1.7915922452530793e-06, "loss": 0.7851, "num_input_tokens_seen": 34505320, "step": 59815 }, { "epoch": 8.909740840035747, "grad_norm": 0.2631540596485138, "learning_rate": 1.7891771402498813e-06, "loss": 0.8043, "num_input_tokens_seen": 34508328, "step": 59820 }, { "epoch": 8.910485552576706, "grad_norm": 0.18562540411949158, "learning_rate": 1.7867636037397244e-06, "loss": 0.8105, "num_input_tokens_seen": 34511080, "step": 59825 }, { "epoch": 8.911230265117664, "grad_norm": 0.2159562110900879, "learning_rate": 1.7843516358857004e-06, "loss": 0.8195, "num_input_tokens_seen": 34513800, "step": 59830 }, { "epoch": 8.911974977658623, "grad_norm": 0.15836027264595032, "learning_rate": 1.7819412368508064e-06, "loss": 0.7979, "num_input_tokens_seen": 34516552, "step": 59835 }, { "epoch": 8.912719690199584, "grad_norm": 0.27890610694885254, "learning_rate": 1.7795324067979318e-06, "loss": 0.7754, "num_input_tokens_seen": 34519272, "step": 59840 }, { "epoch": 8.913464402740543, "grad_norm": 0.2345307320356369, "learning_rate": 1.7771251458898436e-06, "loss": 0.7824, "num_input_tokens_seen": 34522248, "step": 59845 }, { "epoch": 8.914209115281501, "grad_norm": 0.2381429374217987, "learning_rate": 1.7747194542892226e-06, "loss": 0.7874, "num_input_tokens_seen": 34525320, "step": 59850 }, { "epoch": 8.91495382782246, "grad_norm": 0.2351573258638382, "learning_rate": 1.7723153321586305e-06, "loss": 0.8036, "num_input_tokens_seen": 34528232, "step": 59855 }, { "epoch": 8.915698540363419, "grad_norm": 0.16300173103809357, "learning_rate": 1.7699127796605348e-06, "loss": 0.7831, "num_input_tokens_seen": 34531368, "step": 59860 }, { "epoch": 8.91644325290438, "grad_norm": 0.19328109920024872, "learning_rate": 1.7675117969572885e-06, "loss": 0.7783, "num_input_tokens_seen": 34534280, "step": 59865 }, { "epoch": 8.917187965445338, "grad_norm": 0.2458595484495163, "learning_rate": 1.7651123842111372e-06, "loss": 0.7791, "num_input_tokens_seen": 34537480, "step": 59870 }, { "epoch": 8.917932677986297, "grad_norm": 0.1807785928249359, "learning_rate": 1.7627145415842261e-06, "loss": 0.7793, "num_input_tokens_seen": 34540296, "step": 59875 }, { "epoch": 8.918677390527257, "grad_norm": 0.25281259417533875, "learning_rate": 1.7603182692385867e-06, "loss": 0.792, "num_input_tokens_seen": 34543048, "step": 59880 }, { "epoch": 8.919422103068216, "grad_norm": 0.4301159977912903, "learning_rate": 1.7579235673361533e-06, "loss": 0.789, "num_input_tokens_seen": 34546312, "step": 59885 }, { "epoch": 8.920166815609175, "grad_norm": 0.23023830354213715, "learning_rate": 1.755530436038752e-06, "loss": 0.7967, "num_input_tokens_seen": 34549064, "step": 59890 }, { "epoch": 8.920911528150134, "grad_norm": 0.29549744725227356, "learning_rate": 1.7531388755080951e-06, "loss": 0.7785, "num_input_tokens_seen": 34553096, "step": 59895 }, { "epoch": 8.921656240691092, "grad_norm": 0.20203758776187897, "learning_rate": 1.7507488859058035e-06, "loss": 0.8156, "num_input_tokens_seen": 34556008, "step": 59900 }, { "epoch": 8.922400953232053, "grad_norm": 0.2636222243309021, "learning_rate": 1.7483604673933756e-06, "loss": 0.7741, "num_input_tokens_seen": 34558856, "step": 59905 }, { "epoch": 8.923145665773012, "grad_norm": 0.25694015622138977, "learning_rate": 1.7459736201322158e-06, "loss": 0.7672, "num_input_tokens_seen": 34561512, "step": 59910 }, { "epoch": 8.92389037831397, "grad_norm": 0.4401426613330841, "learning_rate": 1.7435883442836086e-06, "loss": 0.8162, "num_input_tokens_seen": 34564648, "step": 59915 }, { "epoch": 8.92463509085493, "grad_norm": 0.17048607766628265, "learning_rate": 1.7412046400087505e-06, "loss": 0.7977, "num_input_tokens_seen": 34567432, "step": 59920 }, { "epoch": 8.92537980339589, "grad_norm": 0.17792749404907227, "learning_rate": 1.7388225074687182e-06, "loss": 0.8095, "num_input_tokens_seen": 34570152, "step": 59925 }, { "epoch": 8.926124515936849, "grad_norm": 0.1863105297088623, "learning_rate": 1.736441946824488e-06, "loss": 0.7832, "num_input_tokens_seen": 34572840, "step": 59930 }, { "epoch": 8.926869228477807, "grad_norm": 0.24773068726062775, "learning_rate": 1.7340629582369316e-06, "loss": 0.7928, "num_input_tokens_seen": 34575880, "step": 59935 }, { "epoch": 8.927613941018766, "grad_norm": 0.3473769724369049, "learning_rate": 1.7316855418668038e-06, "loss": 0.7968, "num_input_tokens_seen": 34579176, "step": 59940 }, { "epoch": 8.928358653559727, "grad_norm": 0.24345803260803223, "learning_rate": 1.7293096978747703e-06, "loss": 0.8312, "num_input_tokens_seen": 34581864, "step": 59945 }, { "epoch": 8.929103366100685, "grad_norm": 0.22844435274600983, "learning_rate": 1.7269354264213694e-06, "loss": 0.8081, "num_input_tokens_seen": 34584840, "step": 59950 }, { "epoch": 8.929848078641644, "grad_norm": 0.2817437946796417, "learning_rate": 1.7245627276670535e-06, "loss": 0.8206, "num_input_tokens_seen": 34587816, "step": 59955 }, { "epoch": 8.930592791182603, "grad_norm": 0.23737874627113342, "learning_rate": 1.722191601772158e-06, "loss": 0.8169, "num_input_tokens_seen": 34590792, "step": 59960 }, { "epoch": 8.931337503723563, "grad_norm": 0.2294079065322876, "learning_rate": 1.7198220488969102e-06, "loss": 0.7769, "num_input_tokens_seen": 34593640, "step": 59965 }, { "epoch": 8.932082216264522, "grad_norm": 0.18964126706123352, "learning_rate": 1.7174540692014435e-06, "loss": 0.7938, "num_input_tokens_seen": 34596232, "step": 59970 }, { "epoch": 8.932826928805481, "grad_norm": 0.2530345320701599, "learning_rate": 1.7150876628457686e-06, "loss": 0.7999, "num_input_tokens_seen": 34598792, "step": 59975 }, { "epoch": 8.93357164134644, "grad_norm": 0.3142586648464203, "learning_rate": 1.7127228299897991e-06, "loss": 0.8143, "num_input_tokens_seen": 34601704, "step": 59980 }, { "epoch": 8.9343163538874, "grad_norm": 0.2433595359325409, "learning_rate": 1.7103595707933434e-06, "loss": 0.802, "num_input_tokens_seen": 34604392, "step": 59985 }, { "epoch": 8.935061066428359, "grad_norm": 0.20733565092086792, "learning_rate": 1.707997885416096e-06, "loss": 0.8256, "num_input_tokens_seen": 34607080, "step": 59990 }, { "epoch": 8.935805778969318, "grad_norm": 0.20789512991905212, "learning_rate": 1.7056377740176543e-06, "loss": 0.8058, "num_input_tokens_seen": 34609928, "step": 59995 }, { "epoch": 8.936550491510276, "grad_norm": 0.24474556744098663, "learning_rate": 1.7032792367575047e-06, "loss": 0.7943, "num_input_tokens_seen": 34612712, "step": 60000 }, { "epoch": 8.937295204051237, "grad_norm": 0.25264474749565125, "learning_rate": 1.7009222737950276e-06, "loss": 0.7883, "num_input_tokens_seen": 34615464, "step": 60005 }, { "epoch": 8.938039916592196, "grad_norm": 0.19754265248775482, "learning_rate": 1.698566885289496e-06, "loss": 0.7941, "num_input_tokens_seen": 34618248, "step": 60010 }, { "epoch": 8.938784629133155, "grad_norm": 0.17173722386360168, "learning_rate": 1.696213071400074e-06, "loss": 0.821, "num_input_tokens_seen": 34621096, "step": 60015 }, { "epoch": 8.939529341674113, "grad_norm": 0.20893928408622742, "learning_rate": 1.693860832285829e-06, "loss": 0.7615, "num_input_tokens_seen": 34624296, "step": 60020 }, { "epoch": 8.940274054215074, "grad_norm": 0.2457793951034546, "learning_rate": 1.6915101681057144e-06, "loss": 0.7855, "num_input_tokens_seen": 34627048, "step": 60025 }, { "epoch": 8.941018766756033, "grad_norm": 0.2330169528722763, "learning_rate": 1.6891610790185752e-06, "loss": 0.7923, "num_input_tokens_seen": 34629928, "step": 60030 }, { "epoch": 8.941763479296991, "grad_norm": 0.25090354681015015, "learning_rate": 1.686813565183154e-06, "loss": 0.8375, "num_input_tokens_seen": 34633224, "step": 60035 }, { "epoch": 8.94250819183795, "grad_norm": 0.3034534752368927, "learning_rate": 1.6844676267580932e-06, "loss": 0.8065, "num_input_tokens_seen": 34636200, "step": 60040 }, { "epoch": 8.943252904378909, "grad_norm": 0.3675200343132019, "learning_rate": 1.6821232639019107e-06, "loss": 0.8425, "num_input_tokens_seen": 34639208, "step": 60045 }, { "epoch": 8.94399761691987, "grad_norm": 0.22082428634166718, "learning_rate": 1.6797804767730352e-06, "loss": 0.8082, "num_input_tokens_seen": 34641928, "step": 60050 }, { "epoch": 8.944742329460828, "grad_norm": 0.22669868171215057, "learning_rate": 1.6774392655297817e-06, "loss": 0.8224, "num_input_tokens_seen": 34644616, "step": 60055 }, { "epoch": 8.945487042001787, "grad_norm": 0.24652878940105438, "learning_rate": 1.6750996303303596e-06, "loss": 0.7951, "num_input_tokens_seen": 34647656, "step": 60060 }, { "epoch": 8.946231754542747, "grad_norm": 0.2988625168800354, "learning_rate": 1.6727615713328788e-06, "loss": 0.8082, "num_input_tokens_seen": 34650664, "step": 60065 }, { "epoch": 8.946976467083706, "grad_norm": 0.19041498005390167, "learning_rate": 1.670425088695321e-06, "loss": 0.8256, "num_input_tokens_seen": 34653608, "step": 60070 }, { "epoch": 8.947721179624665, "grad_norm": 0.2506144940853119, "learning_rate": 1.6680901825755908e-06, "loss": 0.7974, "num_input_tokens_seen": 34656584, "step": 60075 }, { "epoch": 8.948465892165624, "grad_norm": 0.28611794114112854, "learning_rate": 1.6657568531314615e-06, "loss": 0.7939, "num_input_tokens_seen": 34659208, "step": 60080 }, { "epoch": 8.949210604706582, "grad_norm": 0.19426506757736206, "learning_rate": 1.663425100520616e-06, "loss": 0.8026, "num_input_tokens_seen": 34661864, "step": 60085 }, { "epoch": 8.949955317247543, "grad_norm": 0.2745800018310547, "learning_rate": 1.661094924900619e-06, "loss": 0.783, "num_input_tokens_seen": 34665000, "step": 60090 }, { "epoch": 8.950700029788502, "grad_norm": 0.24287088215351105, "learning_rate": 1.65876632642894e-06, "loss": 0.8115, "num_input_tokens_seen": 34667848, "step": 60095 }, { "epoch": 8.95144474232946, "grad_norm": 0.24058060348033905, "learning_rate": 1.6564393052629384e-06, "loss": 0.7831, "num_input_tokens_seen": 34670792, "step": 60100 }, { "epoch": 8.95218945487042, "grad_norm": 0.21127356588840485, "learning_rate": 1.6541138615598585e-06, "loss": 0.7962, "num_input_tokens_seen": 34673416, "step": 60105 }, { "epoch": 8.95293416741138, "grad_norm": 0.22512443363666534, "learning_rate": 1.6517899954768434e-06, "loss": 0.793, "num_input_tokens_seen": 34676232, "step": 60110 }, { "epoch": 8.953678879952339, "grad_norm": 0.43296390771865845, "learning_rate": 1.6494677071709347e-06, "loss": 0.7861, "num_input_tokens_seen": 34679048, "step": 60115 }, { "epoch": 8.954423592493297, "grad_norm": 0.27524203062057495, "learning_rate": 1.6471469967990622e-06, "loss": 0.7839, "num_input_tokens_seen": 34681992, "step": 60120 }, { "epoch": 8.955168305034256, "grad_norm": 0.20164847373962402, "learning_rate": 1.6448278645180477e-06, "loss": 0.8297, "num_input_tokens_seen": 34684776, "step": 60125 }, { "epoch": 8.955913017575217, "grad_norm": 0.22089943289756775, "learning_rate": 1.6425103104846128e-06, "loss": 0.8035, "num_input_tokens_seen": 34687528, "step": 60130 }, { "epoch": 8.956657730116175, "grad_norm": 0.2536613941192627, "learning_rate": 1.6401943348553688e-06, "loss": 0.7815, "num_input_tokens_seen": 34690600, "step": 60135 }, { "epoch": 8.957402442657134, "grad_norm": 0.26816654205322266, "learning_rate": 1.6378799377868155e-06, "loss": 0.7746, "num_input_tokens_seen": 34693448, "step": 60140 }, { "epoch": 8.958147155198093, "grad_norm": 0.22939638793468475, "learning_rate": 1.635567119435355e-06, "loss": 0.8018, "num_input_tokens_seen": 34696520, "step": 60145 }, { "epoch": 8.958891867739053, "grad_norm": 0.21077044308185577, "learning_rate": 1.6332558799572711e-06, "loss": 0.7858, "num_input_tokens_seen": 34699240, "step": 60150 }, { "epoch": 8.959636580280012, "grad_norm": 0.27578431367874146, "learning_rate": 1.6309462195087555e-06, "loss": 0.8375, "num_input_tokens_seen": 34702248, "step": 60155 }, { "epoch": 8.960381292820971, "grad_norm": 0.24712663888931274, "learning_rate": 1.6286381382458803e-06, "loss": 0.828, "num_input_tokens_seen": 34705384, "step": 60160 }, { "epoch": 8.96112600536193, "grad_norm": 0.18657802045345306, "learning_rate": 1.6263316363246184e-06, "loss": 0.7914, "num_input_tokens_seen": 34708168, "step": 60165 }, { "epoch": 8.96187071790289, "grad_norm": 0.19706855714321136, "learning_rate": 1.624026713900839e-06, "loss": 0.7958, "num_input_tokens_seen": 34711080, "step": 60170 }, { "epoch": 8.962615430443849, "grad_norm": 0.22924254834651947, "learning_rate": 1.6217233711302904e-06, "loss": 0.775, "num_input_tokens_seen": 34714184, "step": 60175 }, { "epoch": 8.963360142984808, "grad_norm": 0.24581338465213776, "learning_rate": 1.619421608168628e-06, "loss": 0.8068, "num_input_tokens_seen": 34717000, "step": 60180 }, { "epoch": 8.964104855525767, "grad_norm": 0.23107042908668518, "learning_rate": 1.6171214251713974e-06, "loss": 0.8093, "num_input_tokens_seen": 34719816, "step": 60185 }, { "epoch": 8.964849568066727, "grad_norm": 0.19091083109378815, "learning_rate": 1.6148228222940292e-06, "loss": 0.8147, "num_input_tokens_seen": 34722376, "step": 60190 }, { "epoch": 8.965594280607686, "grad_norm": 0.21617551147937775, "learning_rate": 1.6125257996918609e-06, "loss": 0.8218, "num_input_tokens_seen": 34725128, "step": 60195 }, { "epoch": 8.966338993148645, "grad_norm": 0.16526252031326294, "learning_rate": 1.6102303575201095e-06, "loss": 0.7865, "num_input_tokens_seen": 34728392, "step": 60200 }, { "epoch": 8.967083705689603, "grad_norm": 0.21164202690124512, "learning_rate": 1.6079364959338983e-06, "loss": 0.8293, "num_input_tokens_seen": 34730984, "step": 60205 }, { "epoch": 8.967828418230564, "grad_norm": 0.15328675508499146, "learning_rate": 1.6056442150882283e-06, "loss": 0.8094, "num_input_tokens_seen": 34733864, "step": 60210 }, { "epoch": 8.968573130771523, "grad_norm": 0.2471839338541031, "learning_rate": 1.6033535151380092e-06, "loss": 0.7807, "num_input_tokens_seen": 34736584, "step": 60215 }, { "epoch": 8.969317843312481, "grad_norm": 0.25740736722946167, "learning_rate": 1.6010643962380362e-06, "loss": 0.8319, "num_input_tokens_seen": 34739496, "step": 60220 }, { "epoch": 8.97006255585344, "grad_norm": 0.20565108954906464, "learning_rate": 1.5987768585430025e-06, "loss": 0.7924, "num_input_tokens_seen": 34742184, "step": 60225 }, { "epoch": 8.970807268394399, "grad_norm": 0.21485789120197296, "learning_rate": 1.5964909022074815e-06, "loss": 0.7995, "num_input_tokens_seen": 34745128, "step": 60230 }, { "epoch": 8.97155198093536, "grad_norm": 0.3123572766780853, "learning_rate": 1.5942065273859552e-06, "loss": 0.8191, "num_input_tokens_seen": 34748072, "step": 60235 }, { "epoch": 8.972296693476318, "grad_norm": 0.24402235448360443, "learning_rate": 1.591923734232792e-06, "loss": 0.8374, "num_input_tokens_seen": 34750792, "step": 60240 }, { "epoch": 8.973041406017277, "grad_norm": 0.2146177440881729, "learning_rate": 1.5896425229022488e-06, "loss": 0.8189, "num_input_tokens_seen": 34753480, "step": 60245 }, { "epoch": 8.973786118558236, "grad_norm": 0.21596817672252655, "learning_rate": 1.5873628935484858e-06, "loss": 0.8184, "num_input_tokens_seen": 34756424, "step": 60250 }, { "epoch": 8.974530831099196, "grad_norm": 0.22256411612033844, "learning_rate": 1.585084846325549e-06, "loss": 0.7976, "num_input_tokens_seen": 34759112, "step": 60255 }, { "epoch": 8.975275543640155, "grad_norm": 0.21582208573818207, "learning_rate": 1.5828083813873824e-06, "loss": 0.8052, "num_input_tokens_seen": 34762056, "step": 60260 }, { "epoch": 8.976020256181114, "grad_norm": 0.2560267448425293, "learning_rate": 1.580533498887818e-06, "loss": 0.7918, "num_input_tokens_seen": 34764808, "step": 60265 }, { "epoch": 8.976764968722073, "grad_norm": 0.33366674184799194, "learning_rate": 1.5782601989805857e-06, "loss": 0.8113, "num_input_tokens_seen": 34767528, "step": 60270 }, { "epoch": 8.977509681263033, "grad_norm": 0.21915070712566376, "learning_rate": 1.5759884818192988e-06, "loss": 0.8121, "num_input_tokens_seen": 34770472, "step": 60275 }, { "epoch": 8.978254393803992, "grad_norm": 0.30876535177230835, "learning_rate": 1.5737183475574762e-06, "loss": 0.8089, "num_input_tokens_seen": 34773192, "step": 60280 }, { "epoch": 8.97899910634495, "grad_norm": 0.24001255631446838, "learning_rate": 1.5714497963485203e-06, "loss": 0.8064, "num_input_tokens_seen": 34775880, "step": 60285 }, { "epoch": 8.97974381888591, "grad_norm": 0.15952660143375397, "learning_rate": 1.569182828345736e-06, "loss": 0.7947, "num_input_tokens_seen": 34778600, "step": 60290 }, { "epoch": 8.98048853142687, "grad_norm": 0.18632937967777252, "learning_rate": 1.5669174437023149e-06, "loss": 0.8086, "num_input_tokens_seen": 34781512, "step": 60295 }, { "epoch": 8.981233243967829, "grad_norm": 0.2350456565618515, "learning_rate": 1.5646536425713426e-06, "loss": 0.8014, "num_input_tokens_seen": 34784680, "step": 60300 }, { "epoch": 8.981977956508787, "grad_norm": 0.26950663328170776, "learning_rate": 1.5623914251057942e-06, "loss": 0.799, "num_input_tokens_seen": 34787560, "step": 60305 }, { "epoch": 8.982722669049746, "grad_norm": 0.2764631509780884, "learning_rate": 1.5601307914585416e-06, "loss": 0.8117, "num_input_tokens_seen": 34790344, "step": 60310 }, { "epoch": 8.983467381590707, "grad_norm": 0.20754480361938477, "learning_rate": 1.5578717417823518e-06, "loss": 0.7941, "num_input_tokens_seen": 34793096, "step": 60315 }, { "epoch": 8.984212094131665, "grad_norm": 0.2035318911075592, "learning_rate": 1.5556142762298776e-06, "loss": 0.8276, "num_input_tokens_seen": 34795912, "step": 60320 }, { "epoch": 8.984956806672624, "grad_norm": 0.34641820192337036, "learning_rate": 1.5533583949536745e-06, "loss": 0.8014, "num_input_tokens_seen": 34799112, "step": 60325 }, { "epoch": 8.985701519213583, "grad_norm": 0.29519909620285034, "learning_rate": 1.5511040981061848e-06, "loss": 0.8257, "num_input_tokens_seen": 34801832, "step": 60330 }, { "epoch": 8.986446231754543, "grad_norm": 0.23570892214775085, "learning_rate": 1.5488513858397475e-06, "loss": 0.8167, "num_input_tokens_seen": 34804808, "step": 60335 }, { "epoch": 8.987190944295502, "grad_norm": 0.22037608921527863, "learning_rate": 1.5466002583065825e-06, "loss": 0.8138, "num_input_tokens_seen": 34807688, "step": 60340 }, { "epoch": 8.987935656836461, "grad_norm": 0.22509652376174927, "learning_rate": 1.544350715658821e-06, "loss": 0.7767, "num_input_tokens_seen": 34810696, "step": 60345 }, { "epoch": 8.98868036937742, "grad_norm": 0.1898011863231659, "learning_rate": 1.542102758048472e-06, "loss": 0.7888, "num_input_tokens_seen": 34813640, "step": 60350 }, { "epoch": 8.98942508191838, "grad_norm": 0.25204750895500183, "learning_rate": 1.5398563856274472e-06, "loss": 0.8, "num_input_tokens_seen": 34816520, "step": 60355 }, { "epoch": 8.990169794459339, "grad_norm": 0.16510072350502014, "learning_rate": 1.5376115985475448e-06, "loss": 0.8118, "num_input_tokens_seen": 34819144, "step": 60360 }, { "epoch": 8.990914507000298, "grad_norm": 0.19690507650375366, "learning_rate": 1.535368396960457e-06, "loss": 0.809, "num_input_tokens_seen": 34821928, "step": 60365 }, { "epoch": 8.991659219541257, "grad_norm": 0.2850424647331238, "learning_rate": 1.5331267810177797e-06, "loss": 0.8012, "num_input_tokens_seen": 34824712, "step": 60370 }, { "epoch": 8.992403932082215, "grad_norm": 0.23006348311901093, "learning_rate": 1.53088675087098e-06, "loss": 0.7938, "num_input_tokens_seen": 34827592, "step": 60375 }, { "epoch": 8.993148644623176, "grad_norm": 0.3371950089931488, "learning_rate": 1.5286483066714347e-06, "loss": 0.7876, "num_input_tokens_seen": 34830600, "step": 60380 }, { "epoch": 8.993893357164135, "grad_norm": 0.2843162715435028, "learning_rate": 1.526411448570414e-06, "loss": 0.7992, "num_input_tokens_seen": 34833416, "step": 60385 }, { "epoch": 8.994638069705093, "grad_norm": 0.21263545751571655, "learning_rate": 1.5241761767190665e-06, "loss": 0.7974, "num_input_tokens_seen": 34836104, "step": 60390 }, { "epoch": 8.995382782246054, "grad_norm": 0.2515490651130676, "learning_rate": 1.5219424912684494e-06, "loss": 0.7727, "num_input_tokens_seen": 34839048, "step": 60395 }, { "epoch": 8.996127494787013, "grad_norm": 0.187667116522789, "learning_rate": 1.5197103923695e-06, "loss": 0.7828, "num_input_tokens_seen": 34841832, "step": 60400 }, { "epoch": 8.996872207327971, "grad_norm": 0.22472132742404938, "learning_rate": 1.5174798801730644e-06, "loss": 0.7782, "num_input_tokens_seen": 34844584, "step": 60405 }, { "epoch": 8.99761691986893, "grad_norm": 0.275480180978775, "learning_rate": 1.5152509548298639e-06, "loss": 0.7754, "num_input_tokens_seen": 34847880, "step": 60410 }, { "epoch": 8.998361632409889, "grad_norm": 0.2400190234184265, "learning_rate": 1.5130236164905192e-06, "loss": 0.7954, "num_input_tokens_seen": 34850984, "step": 60415 }, { "epoch": 8.99910634495085, "grad_norm": 0.1777728796005249, "learning_rate": 1.5107978653055466e-06, "loss": 0.8073, "num_input_tokens_seen": 34853736, "step": 60420 }, { "epoch": 8.999851057491808, "grad_norm": 0.1752179116010666, "learning_rate": 1.5085737014253586e-06, "loss": 0.8165, "num_input_tokens_seen": 34856584, "step": 60425 }, { "epoch": 9.0, "eval_loss": 0.8027651309967041, "eval_runtime": 45.1609, "eval_samples_per_second": 66.075, "eval_steps_per_second": 16.519, "num_input_tokens_seen": 34856680, "step": 60426 }, { "epoch": 9.000595770032767, "grad_norm": 0.19411171972751617, "learning_rate": 1.5063511250002466e-06, "loss": 0.7918, "num_input_tokens_seen": 34859112, "step": 60430 }, { "epoch": 9.001340482573726, "grad_norm": 0.16292770206928253, "learning_rate": 1.5041301361804123e-06, "loss": 0.7836, "num_input_tokens_seen": 34862024, "step": 60435 }, { "epoch": 9.002085195114686, "grad_norm": 0.2581808269023895, "learning_rate": 1.5019107351159328e-06, "loss": 0.7954, "num_input_tokens_seen": 34865224, "step": 60440 }, { "epoch": 9.002829907655645, "grad_norm": 0.25991591811180115, "learning_rate": 1.4996929219567884e-06, "loss": 0.747, "num_input_tokens_seen": 34868520, "step": 60445 }, { "epoch": 9.003574620196604, "grad_norm": 0.23907439410686493, "learning_rate": 1.4974766968528508e-06, "loss": 0.7904, "num_input_tokens_seen": 34871400, "step": 60450 }, { "epoch": 9.004319332737563, "grad_norm": 0.14814867079257965, "learning_rate": 1.4952620599538864e-06, "loss": 0.7855, "num_input_tokens_seen": 34874344, "step": 60455 }, { "epoch": 9.005064045278523, "grad_norm": 0.31856250762939453, "learning_rate": 1.4930490114095446e-06, "loss": 0.7972, "num_input_tokens_seen": 34877064, "step": 60460 }, { "epoch": 9.005808757819482, "grad_norm": 0.24983914196491241, "learning_rate": 1.490837551369384e-06, "loss": 0.7926, "num_input_tokens_seen": 34880008, "step": 60465 }, { "epoch": 9.00655347036044, "grad_norm": 0.31922367215156555, "learning_rate": 1.4886276799828402e-06, "loss": 0.7813, "num_input_tokens_seen": 34882824, "step": 60470 }, { "epoch": 9.0072981829014, "grad_norm": 0.2532440721988678, "learning_rate": 1.4864193973992441e-06, "loss": 0.8007, "num_input_tokens_seen": 34885736, "step": 60475 }, { "epoch": 9.00804289544236, "grad_norm": 0.2165810614824295, "learning_rate": 1.484212703767826e-06, "loss": 0.8283, "num_input_tokens_seen": 34888904, "step": 60480 }, { "epoch": 9.008787607983319, "grad_norm": 0.291903018951416, "learning_rate": 1.482007599237706e-06, "loss": 0.7806, "num_input_tokens_seen": 34891816, "step": 60485 }, { "epoch": 9.009532320524277, "grad_norm": 0.20672526955604553, "learning_rate": 1.4798040839578946e-06, "loss": 0.8012, "num_input_tokens_seen": 34894664, "step": 60490 }, { "epoch": 9.010277033065236, "grad_norm": 0.20963245630264282, "learning_rate": 1.4776021580772958e-06, "loss": 0.7899, "num_input_tokens_seen": 34897448, "step": 60495 }, { "epoch": 9.011021745606197, "grad_norm": 0.217063769698143, "learning_rate": 1.4754018217447125e-06, "loss": 0.7921, "num_input_tokens_seen": 34900264, "step": 60500 }, { "epoch": 9.011766458147155, "grad_norm": 0.2516171336174011, "learning_rate": 1.4732030751088255e-06, "loss": 0.7713, "num_input_tokens_seen": 34902952, "step": 60505 }, { "epoch": 9.012511170688114, "grad_norm": 0.1886644810438156, "learning_rate": 1.4710059183182274e-06, "loss": 0.8049, "num_input_tokens_seen": 34905704, "step": 60510 }, { "epoch": 9.013255883229073, "grad_norm": 0.23934100568294525, "learning_rate": 1.4688103515213824e-06, "loss": 0.7997, "num_input_tokens_seen": 34908808, "step": 60515 }, { "epoch": 9.014000595770034, "grad_norm": 0.18875281512737274, "learning_rate": 1.466616374866664e-06, "loss": 0.7702, "num_input_tokens_seen": 34911656, "step": 60520 }, { "epoch": 9.014745308310992, "grad_norm": 0.2531949579715729, "learning_rate": 1.4644239885023309e-06, "loss": 0.7942, "num_input_tokens_seen": 34914568, "step": 60525 }, { "epoch": 9.015490020851951, "grad_norm": 0.21602113544940948, "learning_rate": 1.4622331925765343e-06, "loss": 0.7808, "num_input_tokens_seen": 34917544, "step": 60530 }, { "epoch": 9.01623473339291, "grad_norm": 0.26728329062461853, "learning_rate": 1.460043987237325e-06, "loss": 0.7793, "num_input_tokens_seen": 34920200, "step": 60535 }, { "epoch": 9.01697944593387, "grad_norm": 0.2291601449251175, "learning_rate": 1.457856372632635e-06, "loss": 0.8254, "num_input_tokens_seen": 34923016, "step": 60540 }, { "epoch": 9.017724158474829, "grad_norm": 0.2580372393131256, "learning_rate": 1.4556703489102958e-06, "loss": 0.794, "num_input_tokens_seen": 34925800, "step": 60545 }, { "epoch": 9.018468871015788, "grad_norm": 0.19639964401721954, "learning_rate": 1.4534859162180308e-06, "loss": 0.7896, "num_input_tokens_seen": 34929032, "step": 60550 }, { "epoch": 9.019213583556747, "grad_norm": 0.2020813226699829, "learning_rate": 1.451303074703453e-06, "loss": 0.7948, "num_input_tokens_seen": 34932104, "step": 60555 }, { "epoch": 9.019958296097707, "grad_norm": 0.22336260974407196, "learning_rate": 1.4491218245140715e-06, "loss": 0.7901, "num_input_tokens_seen": 34934952, "step": 60560 }, { "epoch": 9.020703008638666, "grad_norm": 0.2969854176044464, "learning_rate": 1.4469421657972855e-06, "loss": 0.8308, "num_input_tokens_seen": 34937960, "step": 60565 }, { "epoch": 9.021447721179625, "grad_norm": 0.197932168841362, "learning_rate": 1.4447640987003935e-06, "loss": 0.7966, "num_input_tokens_seen": 34941000, "step": 60570 }, { "epoch": 9.022192433720583, "grad_norm": 0.23690319061279297, "learning_rate": 1.4425876233705698e-06, "loss": 0.7871, "num_input_tokens_seen": 34943752, "step": 60575 }, { "epoch": 9.022937146261542, "grad_norm": 0.21593986451625824, "learning_rate": 1.4404127399548966e-06, "loss": 0.778, "num_input_tokens_seen": 34946440, "step": 60580 }, { "epoch": 9.023681858802503, "grad_norm": 0.24521782994270325, "learning_rate": 1.4382394486003454e-06, "loss": 0.7945, "num_input_tokens_seen": 34949448, "step": 60585 }, { "epoch": 9.024426571343461, "grad_norm": 0.21170982718467712, "learning_rate": 1.436067749453779e-06, "loss": 0.7872, "num_input_tokens_seen": 34952424, "step": 60590 }, { "epoch": 9.02517128388442, "grad_norm": 0.27351871132850647, "learning_rate": 1.4338976426619493e-06, "loss": 0.8093, "num_input_tokens_seen": 34955528, "step": 60595 }, { "epoch": 9.025915996425379, "grad_norm": 0.29505231976509094, "learning_rate": 1.431729128371506e-06, "loss": 0.8205, "num_input_tokens_seen": 34958760, "step": 60600 }, { "epoch": 9.02666070896634, "grad_norm": 0.21619343757629395, "learning_rate": 1.4295622067289821e-06, "loss": 0.7824, "num_input_tokens_seen": 34961640, "step": 60605 }, { "epoch": 9.027405421507298, "grad_norm": 0.18402600288391113, "learning_rate": 1.4273968778808155e-06, "loss": 0.806, "num_input_tokens_seen": 34964808, "step": 60610 }, { "epoch": 9.028150134048257, "grad_norm": 0.21009695529937744, "learning_rate": 1.4252331419733283e-06, "loss": 0.7971, "num_input_tokens_seen": 34967464, "step": 60615 }, { "epoch": 9.028894846589216, "grad_norm": 0.2394726723432541, "learning_rate": 1.423070999152737e-06, "loss": 0.8064, "num_input_tokens_seen": 34970696, "step": 60620 }, { "epoch": 9.029639559130176, "grad_norm": 0.20791257917881012, "learning_rate": 1.4209104495651492e-06, "loss": 0.8131, "num_input_tokens_seen": 34973608, "step": 60625 }, { "epoch": 9.030384271671135, "grad_norm": 0.21598109602928162, "learning_rate": 1.4187514933565738e-06, "loss": 0.7939, "num_input_tokens_seen": 34976520, "step": 60630 }, { "epoch": 9.031128984212094, "grad_norm": 0.2464340180158615, "learning_rate": 1.4165941306728963e-06, "loss": 0.8153, "num_input_tokens_seen": 34979496, "step": 60635 }, { "epoch": 9.031873696753053, "grad_norm": 0.2409575879573822, "learning_rate": 1.4144383616599033e-06, "loss": 0.803, "num_input_tokens_seen": 34982376, "step": 60640 }, { "epoch": 9.032618409294013, "grad_norm": 0.20571304857730865, "learning_rate": 1.4122841864632724e-06, "loss": 0.8189, "num_input_tokens_seen": 34985000, "step": 60645 }, { "epoch": 9.033363121834972, "grad_norm": 0.19170698523521423, "learning_rate": 1.4101316052285734e-06, "loss": 0.8092, "num_input_tokens_seen": 34987848, "step": 60650 }, { "epoch": 9.03410783437593, "grad_norm": 0.2244664430618286, "learning_rate": 1.4079806181012733e-06, "loss": 0.8264, "num_input_tokens_seen": 34990376, "step": 60655 }, { "epoch": 9.03485254691689, "grad_norm": 0.19426926970481873, "learning_rate": 1.4058312252267253e-06, "loss": 0.7809, "num_input_tokens_seen": 34993032, "step": 60660 }, { "epoch": 9.03559725945785, "grad_norm": 0.17429935932159424, "learning_rate": 1.4036834267501796e-06, "loss": 0.792, "num_input_tokens_seen": 34995816, "step": 60665 }, { "epoch": 9.036341971998809, "grad_norm": 0.20318183302879333, "learning_rate": 1.4015372228167705e-06, "loss": 0.8052, "num_input_tokens_seen": 34998440, "step": 60670 }, { "epoch": 9.037086684539767, "grad_norm": 0.29490989446640015, "learning_rate": 1.399392613571529e-06, "loss": 0.82, "num_input_tokens_seen": 35001416, "step": 60675 }, { "epoch": 9.037831397080726, "grad_norm": 0.19633671641349792, "learning_rate": 1.3972495991593836e-06, "loss": 0.8344, "num_input_tokens_seen": 35004232, "step": 60680 }, { "epoch": 9.038576109621687, "grad_norm": 0.17879293859004974, "learning_rate": 1.3951081797251463e-06, "loss": 0.7963, "num_input_tokens_seen": 35007016, "step": 60685 }, { "epoch": 9.039320822162646, "grad_norm": 0.20711271464824677, "learning_rate": 1.3929683554135292e-06, "loss": 0.8095, "num_input_tokens_seen": 35009800, "step": 60690 }, { "epoch": 9.040065534703604, "grad_norm": 0.26725754141807556, "learning_rate": 1.3908301263691303e-06, "loss": 0.7748, "num_input_tokens_seen": 35012680, "step": 60695 }, { "epoch": 9.040810247244563, "grad_norm": 0.19087877869606018, "learning_rate": 1.3886934927364454e-06, "loss": 0.8307, "num_input_tokens_seen": 35015304, "step": 60700 }, { "epoch": 9.041554959785524, "grad_norm": 0.22592149674892426, "learning_rate": 1.3865584546598559e-06, "loss": 0.7896, "num_input_tokens_seen": 35018056, "step": 60705 }, { "epoch": 9.042299672326482, "grad_norm": 0.18859608471393585, "learning_rate": 1.384425012283644e-06, "loss": 0.7838, "num_input_tokens_seen": 35020616, "step": 60710 }, { "epoch": 9.043044384867441, "grad_norm": 0.1972351223230362, "learning_rate": 1.3822931657519744e-06, "loss": 0.8076, "num_input_tokens_seen": 35023496, "step": 60715 }, { "epoch": 9.0437890974084, "grad_norm": 0.5320576429367065, "learning_rate": 1.3801629152089073e-06, "loss": 0.8248, "num_input_tokens_seen": 35026408, "step": 60720 }, { "epoch": 9.04453380994936, "grad_norm": 0.16765139997005463, "learning_rate": 1.3780342607983999e-06, "loss": 0.7842, "num_input_tokens_seen": 35029160, "step": 60725 }, { "epoch": 9.04527852249032, "grad_norm": 0.24391892552375793, "learning_rate": 1.3759072026642978e-06, "loss": 0.7819, "num_input_tokens_seen": 35031912, "step": 60730 }, { "epoch": 9.046023235031278, "grad_norm": 0.20401154458522797, "learning_rate": 1.3737817409503417e-06, "loss": 0.8089, "num_input_tokens_seen": 35034856, "step": 60735 }, { "epoch": 9.046767947572237, "grad_norm": 0.2178463190793991, "learning_rate": 1.3716578758001557e-06, "loss": 0.7851, "num_input_tokens_seen": 35037832, "step": 60740 }, { "epoch": 9.047512660113195, "grad_norm": 0.26695552468299866, "learning_rate": 1.3695356073572612e-06, "loss": 0.7921, "num_input_tokens_seen": 35040904, "step": 60745 }, { "epoch": 9.048257372654156, "grad_norm": 0.26143839955329895, "learning_rate": 1.3674149357650822e-06, "loss": 0.8303, "num_input_tokens_seen": 35043688, "step": 60750 }, { "epoch": 9.049002085195115, "grad_norm": 0.22057433426380157, "learning_rate": 1.3652958611669153e-06, "loss": 0.8022, "num_input_tokens_seen": 35046696, "step": 60755 }, { "epoch": 9.049746797736073, "grad_norm": 0.29596006870269775, "learning_rate": 1.3631783837059625e-06, "loss": 0.7696, "num_input_tokens_seen": 35050088, "step": 60760 }, { "epoch": 9.050491510277032, "grad_norm": 0.2300201654434204, "learning_rate": 1.3610625035253178e-06, "loss": 0.8197, "num_input_tokens_seen": 35053000, "step": 60765 }, { "epoch": 9.051236222817993, "grad_norm": 0.2894686758518219, "learning_rate": 1.3589482207679555e-06, "loss": 0.816, "num_input_tokens_seen": 35055752, "step": 60770 }, { "epoch": 9.051980935358952, "grad_norm": 0.24517026543617249, "learning_rate": 1.3568355355767559e-06, "loss": 0.8107, "num_input_tokens_seen": 35058696, "step": 60775 }, { "epoch": 9.05272564789991, "grad_norm": 0.19343020021915436, "learning_rate": 1.3547244480944826e-06, "loss": 0.7898, "num_input_tokens_seen": 35061544, "step": 60780 }, { "epoch": 9.053470360440869, "grad_norm": 0.22653718292713165, "learning_rate": 1.3526149584637993e-06, "loss": 0.802, "num_input_tokens_seen": 35064488, "step": 60785 }, { "epoch": 9.05421507298183, "grad_norm": 0.1825908124446869, "learning_rate": 1.3505070668272556e-06, "loss": 0.7916, "num_input_tokens_seen": 35067176, "step": 60790 }, { "epoch": 9.054959785522788, "grad_norm": 0.18792326748371124, "learning_rate": 1.3484007733272908e-06, "loss": 0.7805, "num_input_tokens_seen": 35069768, "step": 60795 }, { "epoch": 9.055704498063747, "grad_norm": 0.22468329966068268, "learning_rate": 1.3462960781062434e-06, "loss": 0.8376, "num_input_tokens_seen": 35072680, "step": 60800 }, { "epoch": 9.056449210604706, "grad_norm": 0.1553216576576233, "learning_rate": 1.344192981306333e-06, "loss": 0.8054, "num_input_tokens_seen": 35075688, "step": 60805 }, { "epoch": 9.057193923145666, "grad_norm": 0.2047976553440094, "learning_rate": 1.3420914830696851e-06, "loss": 0.7818, "num_input_tokens_seen": 35078664, "step": 60810 }, { "epoch": 9.057938635686625, "grad_norm": 0.2331644594669342, "learning_rate": 1.339991583538308e-06, "loss": 0.8284, "num_input_tokens_seen": 35081576, "step": 60815 }, { "epoch": 9.058683348227584, "grad_norm": 0.5334113836288452, "learning_rate": 1.337893282854108e-06, "loss": 0.7813, "num_input_tokens_seen": 35084744, "step": 60820 }, { "epoch": 9.059428060768543, "grad_norm": 0.1912015825510025, "learning_rate": 1.3357965811588741e-06, "loss": 0.8041, "num_input_tokens_seen": 35087400, "step": 60825 }, { "epoch": 9.060172773309503, "grad_norm": 0.27025261521339417, "learning_rate": 1.3337014785942985e-06, "loss": 0.7881, "num_input_tokens_seen": 35090312, "step": 60830 }, { "epoch": 9.060917485850462, "grad_norm": 0.21718630194664001, "learning_rate": 1.33160797530196e-06, "loss": 0.7964, "num_input_tokens_seen": 35093416, "step": 60835 }, { "epoch": 9.06166219839142, "grad_norm": 0.19502389430999756, "learning_rate": 1.32951607142332e-06, "loss": 0.7758, "num_input_tokens_seen": 35096264, "step": 60840 }, { "epoch": 9.06240691093238, "grad_norm": 0.20033590495586395, "learning_rate": 1.3274257670997464e-06, "loss": 0.821, "num_input_tokens_seen": 35099208, "step": 60845 }, { "epoch": 9.06315162347334, "grad_norm": 0.2663677930831909, "learning_rate": 1.3253370624724953e-06, "loss": 0.8005, "num_input_tokens_seen": 35102024, "step": 60850 }, { "epoch": 9.063896336014299, "grad_norm": 0.21158362925052643, "learning_rate": 1.3232499576827096e-06, "loss": 0.7569, "num_input_tokens_seen": 35105288, "step": 60855 }, { "epoch": 9.064641048555258, "grad_norm": 0.18748021125793457, "learning_rate": 1.321164452871429e-06, "loss": 0.8159, "num_input_tokens_seen": 35107944, "step": 60860 }, { "epoch": 9.065385761096216, "grad_norm": 0.30282947421073914, "learning_rate": 1.319080548179588e-06, "loss": 0.8005, "num_input_tokens_seen": 35111400, "step": 60865 }, { "epoch": 9.066130473637177, "grad_norm": 0.16738353669643402, "learning_rate": 1.316998243748005e-06, "loss": 0.7934, "num_input_tokens_seen": 35114184, "step": 60870 }, { "epoch": 9.066875186178136, "grad_norm": 0.32069045305252075, "learning_rate": 1.3149175397173891e-06, "loss": 0.8184, "num_input_tokens_seen": 35117416, "step": 60875 }, { "epoch": 9.067619898719094, "grad_norm": 0.21078082919120789, "learning_rate": 1.3128384362283474e-06, "loss": 0.8053, "num_input_tokens_seen": 35120648, "step": 60880 }, { "epoch": 9.068364611260053, "grad_norm": 0.2576230764389038, "learning_rate": 1.3107609334213816e-06, "loss": 0.8018, "num_input_tokens_seen": 35123656, "step": 60885 }, { "epoch": 9.069109323801014, "grad_norm": 0.20588083565235138, "learning_rate": 1.3086850314368764e-06, "loss": 0.8093, "num_input_tokens_seen": 35126536, "step": 60890 }, { "epoch": 9.069854036341972, "grad_norm": 0.2321692854166031, "learning_rate": 1.3066107304151142e-06, "loss": 0.8154, "num_input_tokens_seen": 35129704, "step": 60895 }, { "epoch": 9.070598748882931, "grad_norm": 0.21608760952949524, "learning_rate": 1.3045380304962745e-06, "loss": 0.7896, "num_input_tokens_seen": 35132488, "step": 60900 }, { "epoch": 9.07134346142389, "grad_norm": 0.2396460771560669, "learning_rate": 1.302466931820412e-06, "loss": 0.7865, "num_input_tokens_seen": 35135496, "step": 60905 }, { "epoch": 9.07208817396485, "grad_norm": 0.20482583343982697, "learning_rate": 1.3003974345274894e-06, "loss": 0.7836, "num_input_tokens_seen": 35138344, "step": 60910 }, { "epoch": 9.07283288650581, "grad_norm": 0.28032249212265015, "learning_rate": 1.2983295387573507e-06, "loss": 0.7818, "num_input_tokens_seen": 35141000, "step": 60915 }, { "epoch": 9.073577599046768, "grad_norm": 0.3078521490097046, "learning_rate": 1.296263244649737e-06, "loss": 0.7872, "num_input_tokens_seen": 35143784, "step": 60920 }, { "epoch": 9.074322311587727, "grad_norm": 0.20193813741207123, "learning_rate": 1.2941985523442806e-06, "loss": 0.7949, "num_input_tokens_seen": 35146568, "step": 60925 }, { "epoch": 9.075067024128685, "grad_norm": 0.26536640524864197, "learning_rate": 1.2921354619805066e-06, "loss": 0.7973, "num_input_tokens_seen": 35149768, "step": 60930 }, { "epoch": 9.075811736669646, "grad_norm": 0.18345917761325836, "learning_rate": 1.290073973697828e-06, "loss": 0.7745, "num_input_tokens_seen": 35152424, "step": 60935 }, { "epoch": 9.076556449210605, "grad_norm": 0.30464497208595276, "learning_rate": 1.288014087635553e-06, "loss": 0.8101, "num_input_tokens_seen": 35155272, "step": 60940 }, { "epoch": 9.077301161751564, "grad_norm": 0.26442140340805054, "learning_rate": 1.2859558039328784e-06, "loss": 0.8133, "num_input_tokens_seen": 35158216, "step": 60945 }, { "epoch": 9.078045874292522, "grad_norm": 0.1911342740058899, "learning_rate": 1.2838991227289016e-06, "loss": 0.818, "num_input_tokens_seen": 35161096, "step": 60950 }, { "epoch": 9.078790586833483, "grad_norm": 0.18498453497886658, "learning_rate": 1.2818440441625946e-06, "loss": 0.792, "num_input_tokens_seen": 35163848, "step": 60955 }, { "epoch": 9.079535299374442, "grad_norm": 0.19176152348518372, "learning_rate": 1.2797905683728377e-06, "loss": 0.7949, "num_input_tokens_seen": 35166856, "step": 60960 }, { "epoch": 9.0802800119154, "grad_norm": 0.29504016041755676, "learning_rate": 1.2777386954983956e-06, "loss": 0.7935, "num_input_tokens_seen": 35170024, "step": 60965 }, { "epoch": 9.081024724456359, "grad_norm": 0.1746087670326233, "learning_rate": 1.2756884256779234e-06, "loss": 0.7952, "num_input_tokens_seen": 35172712, "step": 60970 }, { "epoch": 9.08176943699732, "grad_norm": 0.1867811679840088, "learning_rate": 1.2736397590499716e-06, "loss": 0.8058, "num_input_tokens_seen": 35175464, "step": 60975 }, { "epoch": 9.082514149538278, "grad_norm": 0.2235095053911209, "learning_rate": 1.2715926957529794e-06, "loss": 0.8074, "num_input_tokens_seen": 35177992, "step": 60980 }, { "epoch": 9.083258862079237, "grad_norm": 0.25864529609680176, "learning_rate": 1.2695472359252808e-06, "loss": 0.8135, "num_input_tokens_seen": 35180680, "step": 60985 }, { "epoch": 9.084003574620196, "grad_norm": 0.2609257996082306, "learning_rate": 1.267503379705104e-06, "loss": 0.7858, "num_input_tokens_seen": 35183592, "step": 60990 }, { "epoch": 9.084748287161156, "grad_norm": 0.2689202129840851, "learning_rate": 1.2654611272305521e-06, "loss": 0.798, "num_input_tokens_seen": 35186280, "step": 60995 }, { "epoch": 9.085492999702115, "grad_norm": 0.2414138913154602, "learning_rate": 1.2634204786396458e-06, "loss": 0.8075, "num_input_tokens_seen": 35189352, "step": 61000 }, { "epoch": 9.086237712243074, "grad_norm": 0.13793130218982697, "learning_rate": 1.2613814340702746e-06, "loss": 0.7948, "num_input_tokens_seen": 35192232, "step": 61005 }, { "epoch": 9.086982424784033, "grad_norm": 0.20502179861068726, "learning_rate": 1.2593439936602308e-06, "loss": 0.7989, "num_input_tokens_seen": 35194856, "step": 61010 }, { "epoch": 9.087727137324993, "grad_norm": 0.3054307699203491, "learning_rate": 1.2573081575471963e-06, "loss": 0.8085, "num_input_tokens_seen": 35197704, "step": 61015 }, { "epoch": 9.088471849865952, "grad_norm": 0.2995537221431732, "learning_rate": 1.2552739258687469e-06, "loss": 0.816, "num_input_tokens_seen": 35200840, "step": 61020 }, { "epoch": 9.08921656240691, "grad_norm": 0.2108304500579834, "learning_rate": 1.2532412987623477e-06, "loss": 0.8049, "num_input_tokens_seen": 35203784, "step": 61025 }, { "epoch": 9.08996127494787, "grad_norm": 0.17844663560390472, "learning_rate": 1.2512102763653556e-06, "loss": 0.7926, "num_input_tokens_seen": 35206600, "step": 61030 }, { "epoch": 9.09070598748883, "grad_norm": 0.2197132557630539, "learning_rate": 1.249180858815019e-06, "loss": 0.7639, "num_input_tokens_seen": 35209544, "step": 61035 }, { "epoch": 9.091450700029789, "grad_norm": 0.2002267837524414, "learning_rate": 1.2471530462484727e-06, "loss": 0.8016, "num_input_tokens_seen": 35212264, "step": 61040 }, { "epoch": 9.092195412570748, "grad_norm": 0.232522651553154, "learning_rate": 1.2451268388027514e-06, "loss": 0.8173, "num_input_tokens_seen": 35215016, "step": 61045 }, { "epoch": 9.092940125111706, "grad_norm": 0.20431780815124512, "learning_rate": 1.2431022366147766e-06, "loss": 0.7988, "num_input_tokens_seen": 35217896, "step": 61050 }, { "epoch": 9.093684837652667, "grad_norm": 0.28586345911026, "learning_rate": 1.2410792398213662e-06, "loss": 0.783, "num_input_tokens_seen": 35220744, "step": 61055 }, { "epoch": 9.094429550193626, "grad_norm": 0.2067302018404007, "learning_rate": 1.2390578485592246e-06, "loss": 0.8098, "num_input_tokens_seen": 35223624, "step": 61060 }, { "epoch": 9.095174262734584, "grad_norm": 0.19025062024593353, "learning_rate": 1.2370380629649486e-06, "loss": 0.7953, "num_input_tokens_seen": 35226408, "step": 61065 }, { "epoch": 9.095918975275543, "grad_norm": 0.21470005810260773, "learning_rate": 1.2350198831750259e-06, "loss": 0.8042, "num_input_tokens_seen": 35229320, "step": 61070 }, { "epoch": 9.096663687816504, "grad_norm": 0.3284193277359009, "learning_rate": 1.233003309325842e-06, "loss": 0.8096, "num_input_tokens_seen": 35232360, "step": 61075 }, { "epoch": 9.097408400357462, "grad_norm": 0.1989697962999344, "learning_rate": 1.23098834155366e-06, "loss": 0.802, "num_input_tokens_seen": 35235464, "step": 61080 }, { "epoch": 9.098153112898421, "grad_norm": 0.20943403244018555, "learning_rate": 1.2289749799946487e-06, "loss": 0.7787, "num_input_tokens_seen": 35238696, "step": 61085 }, { "epoch": 9.09889782543938, "grad_norm": 0.20097720623016357, "learning_rate": 1.2269632247848633e-06, "loss": 0.8041, "num_input_tokens_seen": 35241448, "step": 61090 }, { "epoch": 9.099642537980339, "grad_norm": 0.2589392364025116, "learning_rate": 1.2249530760602534e-06, "loss": 0.7886, "num_input_tokens_seen": 35244200, "step": 61095 }, { "epoch": 9.1003872505213, "grad_norm": 0.2941058874130249, "learning_rate": 1.2229445339566465e-06, "loss": 0.8401, "num_input_tokens_seen": 35247208, "step": 61100 }, { "epoch": 9.101131963062258, "grad_norm": 0.25176945328712463, "learning_rate": 1.2209375986097782e-06, "loss": 0.7995, "num_input_tokens_seen": 35250312, "step": 61105 }, { "epoch": 9.101876675603217, "grad_norm": 0.24184419214725494, "learning_rate": 1.218932270155268e-06, "loss": 0.8099, "num_input_tokens_seen": 35253672, "step": 61110 }, { "epoch": 9.102621388144176, "grad_norm": 0.3159220218658447, "learning_rate": 1.2169285487286325e-06, "loss": 0.7734, "num_input_tokens_seen": 35257064, "step": 61115 }, { "epoch": 9.103366100685136, "grad_norm": 0.33879944682121277, "learning_rate": 1.214926434465266e-06, "loss": 0.8049, "num_input_tokens_seen": 35259784, "step": 61120 }, { "epoch": 9.104110813226095, "grad_norm": 0.22616949677467346, "learning_rate": 1.212925927500469e-06, "loss": 0.8204, "num_input_tokens_seen": 35262632, "step": 61125 }, { "epoch": 9.104855525767054, "grad_norm": 0.2633228600025177, "learning_rate": 1.21092702796943e-06, "loss": 0.8166, "num_input_tokens_seen": 35265544, "step": 61130 }, { "epoch": 9.105600238308012, "grad_norm": 0.2357322722673416, "learning_rate": 1.2089297360072193e-06, "loss": 0.7959, "num_input_tokens_seen": 35268328, "step": 61135 }, { "epoch": 9.106344950848973, "grad_norm": 0.22876334190368652, "learning_rate": 1.2069340517488093e-06, "loss": 0.7966, "num_input_tokens_seen": 35271304, "step": 61140 }, { "epoch": 9.107089663389932, "grad_norm": 0.23899313807487488, "learning_rate": 1.2049399753290612e-06, "loss": 0.8166, "num_input_tokens_seen": 35274120, "step": 61145 }, { "epoch": 9.10783437593089, "grad_norm": 0.2589350938796997, "learning_rate": 1.2029475068827262e-06, "loss": 0.779, "num_input_tokens_seen": 35276680, "step": 61150 }, { "epoch": 9.10857908847185, "grad_norm": 0.19363048672676086, "learning_rate": 1.2009566465444517e-06, "loss": 0.7877, "num_input_tokens_seen": 35279400, "step": 61155 }, { "epoch": 9.10932380101281, "grad_norm": 0.2446020394563675, "learning_rate": 1.1989673944487606e-06, "loss": 0.7925, "num_input_tokens_seen": 35282312, "step": 61160 }, { "epoch": 9.110068513553768, "grad_norm": 0.21822218596935272, "learning_rate": 1.19697975073009e-06, "loss": 0.7582, "num_input_tokens_seen": 35285288, "step": 61165 }, { "epoch": 9.110813226094727, "grad_norm": 0.217117041349411, "learning_rate": 1.194993715522749e-06, "loss": 0.8217, "num_input_tokens_seen": 35288296, "step": 61170 }, { "epoch": 9.111557938635686, "grad_norm": 0.3239993453025818, "learning_rate": 1.1930092889609473e-06, "loss": 0.7786, "num_input_tokens_seen": 35291240, "step": 61175 }, { "epoch": 9.112302651176647, "grad_norm": 0.18409676849842072, "learning_rate": 1.1910264711787855e-06, "loss": 0.8005, "num_input_tokens_seen": 35293960, "step": 61180 }, { "epoch": 9.113047363717605, "grad_norm": 0.275858998298645, "learning_rate": 1.1890452623102566e-06, "loss": 0.8155, "num_input_tokens_seen": 35296744, "step": 61185 }, { "epoch": 9.113792076258564, "grad_norm": 0.28539136052131653, "learning_rate": 1.1870656624892397e-06, "loss": 0.7943, "num_input_tokens_seen": 35299624, "step": 61190 }, { "epoch": 9.114536788799523, "grad_norm": 0.30116263031959534, "learning_rate": 1.1850876718495107e-06, "loss": 0.816, "num_input_tokens_seen": 35302216, "step": 61195 }, { "epoch": 9.115281501340483, "grad_norm": 0.22446541488170624, "learning_rate": 1.1831112905247327e-06, "loss": 0.8275, "num_input_tokens_seen": 35305000, "step": 61200 }, { "epoch": 9.116026213881442, "grad_norm": 0.25147512555122375, "learning_rate": 1.1811365186484595e-06, "loss": 0.7885, "num_input_tokens_seen": 35308136, "step": 61205 }, { "epoch": 9.1167709264224, "grad_norm": 0.3445777893066406, "learning_rate": 1.1791633563541404e-06, "loss": 0.8183, "num_input_tokens_seen": 35311112, "step": 61210 }, { "epoch": 9.11751563896336, "grad_norm": 0.2979466915130615, "learning_rate": 1.1771918037751128e-06, "loss": 0.8444, "num_input_tokens_seen": 35313800, "step": 61215 }, { "epoch": 9.11826035150432, "grad_norm": 0.19575421512126923, "learning_rate": 1.1752218610446037e-06, "loss": 0.8078, "num_input_tokens_seen": 35316584, "step": 61220 }, { "epoch": 9.119005064045279, "grad_norm": 0.2268066555261612, "learning_rate": 1.1732535282957397e-06, "loss": 0.8178, "num_input_tokens_seen": 35319432, "step": 61225 }, { "epoch": 9.119749776586238, "grad_norm": 0.23825642466545105, "learning_rate": 1.171286805661534e-06, "loss": 0.797, "num_input_tokens_seen": 35322216, "step": 61230 }, { "epoch": 9.120494489127196, "grad_norm": 0.26831576228141785, "learning_rate": 1.169321693274883e-06, "loss": 0.7912, "num_input_tokens_seen": 35325096, "step": 61235 }, { "epoch": 9.121239201668157, "grad_norm": 0.21093571186065674, "learning_rate": 1.1673581912685805e-06, "loss": 0.7864, "num_input_tokens_seen": 35327816, "step": 61240 }, { "epoch": 9.121983914209116, "grad_norm": 0.23859329521656036, "learning_rate": 1.1653962997753148e-06, "loss": 0.7934, "num_input_tokens_seen": 35330600, "step": 61245 }, { "epoch": 9.122728626750074, "grad_norm": 0.2381698489189148, "learning_rate": 1.1634360189276632e-06, "loss": 0.8027, "num_input_tokens_seen": 35333768, "step": 61250 }, { "epoch": 9.123473339291033, "grad_norm": 0.2691931426525116, "learning_rate": 1.161477348858095e-06, "loss": 0.8003, "num_input_tokens_seen": 35336488, "step": 61255 }, { "epoch": 9.124218051831992, "grad_norm": 0.282385915517807, "learning_rate": 1.1595202896989677e-06, "loss": 0.8211, "num_input_tokens_seen": 35339272, "step": 61260 }, { "epoch": 9.124962764372953, "grad_norm": 0.17984451353549957, "learning_rate": 1.1575648415825285e-06, "loss": 0.7912, "num_input_tokens_seen": 35342216, "step": 61265 }, { "epoch": 9.125707476913911, "grad_norm": 0.19025565683841705, "learning_rate": 1.1556110046409218e-06, "loss": 0.7874, "num_input_tokens_seen": 35345160, "step": 61270 }, { "epoch": 9.12645218945487, "grad_norm": 0.19292035698890686, "learning_rate": 1.153658779006181e-06, "loss": 0.802, "num_input_tokens_seen": 35347944, "step": 61275 }, { "epoch": 9.127196901995829, "grad_norm": 0.2006848305463791, "learning_rate": 1.151708164810228e-06, "loss": 0.7944, "num_input_tokens_seen": 35350920, "step": 61280 }, { "epoch": 9.12794161453679, "grad_norm": 0.18802224099636078, "learning_rate": 1.1497591621848741e-06, "loss": 0.7885, "num_input_tokens_seen": 35353800, "step": 61285 }, { "epoch": 9.128686327077748, "grad_norm": 0.21880187094211578, "learning_rate": 1.1478117712618281e-06, "loss": 0.7984, "num_input_tokens_seen": 35356840, "step": 61290 }, { "epoch": 9.129431039618707, "grad_norm": 0.2276451736688614, "learning_rate": 1.14586599217269e-06, "loss": 0.7831, "num_input_tokens_seen": 35359400, "step": 61295 }, { "epoch": 9.130175752159666, "grad_norm": 0.21498839557170868, "learning_rate": 1.1439218250489408e-06, "loss": 0.7879, "num_input_tokens_seen": 35362312, "step": 61300 }, { "epoch": 9.130920464700626, "grad_norm": 0.2151619791984558, "learning_rate": 1.1419792700219644e-06, "loss": 0.8166, "num_input_tokens_seen": 35365384, "step": 61305 }, { "epoch": 9.131665177241585, "grad_norm": 0.20031294226646423, "learning_rate": 1.1400383272230281e-06, "loss": 0.7901, "num_input_tokens_seen": 35368424, "step": 61310 }, { "epoch": 9.132409889782544, "grad_norm": 0.3043534457683563, "learning_rate": 1.1380989967832962e-06, "loss": 0.8117, "num_input_tokens_seen": 35371496, "step": 61315 }, { "epoch": 9.133154602323502, "grad_norm": 0.18419961631298065, "learning_rate": 1.1361612788338166e-06, "loss": 0.7705, "num_input_tokens_seen": 35374312, "step": 61320 }, { "epoch": 9.133899314864463, "grad_norm": 0.29347658157348633, "learning_rate": 1.134225173505535e-06, "loss": 0.8054, "num_input_tokens_seen": 35377128, "step": 61325 }, { "epoch": 9.134644027405422, "grad_norm": 0.24532335996627808, "learning_rate": 1.1322906809292877e-06, "loss": 0.8321, "num_input_tokens_seen": 35380168, "step": 61330 }, { "epoch": 9.13538873994638, "grad_norm": 0.20515628159046173, "learning_rate": 1.130357801235793e-06, "loss": 0.8073, "num_input_tokens_seen": 35382856, "step": 61335 }, { "epoch": 9.13613345248734, "grad_norm": 0.26727524399757385, "learning_rate": 1.128426534555674e-06, "loss": 0.7643, "num_input_tokens_seen": 35385960, "step": 61340 }, { "epoch": 9.1368781650283, "grad_norm": 0.18841665983200073, "learning_rate": 1.1264968810194315e-06, "loss": 0.793, "num_input_tokens_seen": 35389064, "step": 61345 }, { "epoch": 9.137622877569259, "grad_norm": 0.25575539469718933, "learning_rate": 1.12456884075747e-06, "loss": 0.8093, "num_input_tokens_seen": 35391944, "step": 61350 }, { "epoch": 9.138367590110217, "grad_norm": 0.39120757579803467, "learning_rate": 1.1226424139000797e-06, "loss": 0.7926, "num_input_tokens_seen": 35395016, "step": 61355 }, { "epoch": 9.139112302651176, "grad_norm": 0.2549489438533783, "learning_rate": 1.120717600577431e-06, "loss": 0.7858, "num_input_tokens_seen": 35397864, "step": 61360 }, { "epoch": 9.139857015192137, "grad_norm": 0.2284753918647766, "learning_rate": 1.1187944009196038e-06, "loss": 0.7954, "num_input_tokens_seen": 35400648, "step": 61365 }, { "epoch": 9.140601727733095, "grad_norm": 0.3144940733909607, "learning_rate": 1.116872815056555e-06, "loss": 0.7945, "num_input_tokens_seen": 35403432, "step": 61370 }, { "epoch": 9.141346440274054, "grad_norm": 0.3640463650226593, "learning_rate": 1.1149528431181417e-06, "loss": 0.8326, "num_input_tokens_seen": 35406728, "step": 61375 }, { "epoch": 9.142091152815013, "grad_norm": 0.21539610624313354, "learning_rate": 1.1130344852341017e-06, "loss": 0.7705, "num_input_tokens_seen": 35409672, "step": 61380 }, { "epoch": 9.142835865355973, "grad_norm": 0.2165866494178772, "learning_rate": 1.1111177415340762e-06, "loss": 0.7978, "num_input_tokens_seen": 35412872, "step": 61385 }, { "epoch": 9.143580577896932, "grad_norm": 0.21165895462036133, "learning_rate": 1.109202612147589e-06, "loss": 0.819, "num_input_tokens_seen": 35415624, "step": 61390 }, { "epoch": 9.14432529043789, "grad_norm": 0.23252761363983154, "learning_rate": 1.1072890972040588e-06, "loss": 0.8029, "num_input_tokens_seen": 35418408, "step": 61395 }, { "epoch": 9.14507000297885, "grad_norm": 0.2196110635995865, "learning_rate": 1.1053771968327908e-06, "loss": 0.7997, "num_input_tokens_seen": 35421160, "step": 61400 }, { "epoch": 9.14581471551981, "grad_norm": 0.17791815102100372, "learning_rate": 1.1034669111629787e-06, "loss": 0.8027, "num_input_tokens_seen": 35424200, "step": 61405 }, { "epoch": 9.146559428060769, "grad_norm": 0.19381846487522125, "learning_rate": 1.101558240323719e-06, "loss": 0.8024, "num_input_tokens_seen": 35426952, "step": 61410 }, { "epoch": 9.147304140601728, "grad_norm": 0.19741547107696533, "learning_rate": 1.0996511844439867e-06, "loss": 0.7662, "num_input_tokens_seen": 35429640, "step": 61415 }, { "epoch": 9.148048853142686, "grad_norm": 0.1820545643568039, "learning_rate": 1.097745743652659e-06, "loss": 0.7914, "num_input_tokens_seen": 35432904, "step": 61420 }, { "epoch": 9.148793565683647, "grad_norm": 0.20262639224529266, "learning_rate": 1.095841918078494e-06, "loss": 0.7866, "num_input_tokens_seen": 35435688, "step": 61425 }, { "epoch": 9.149538278224606, "grad_norm": 0.2077368199825287, "learning_rate": 1.0939397078501445e-06, "loss": 0.7752, "num_input_tokens_seen": 35438504, "step": 61430 }, { "epoch": 9.150282990765565, "grad_norm": 0.2456793487071991, "learning_rate": 1.0920391130961577e-06, "loss": 0.7895, "num_input_tokens_seen": 35441384, "step": 61435 }, { "epoch": 9.151027703306523, "grad_norm": 0.21624024212360382, "learning_rate": 1.0901401339449613e-06, "loss": 0.7756, "num_input_tokens_seen": 35444008, "step": 61440 }, { "epoch": 9.151772415847482, "grad_norm": 0.2128484845161438, "learning_rate": 1.0882427705248832e-06, "loss": 0.7886, "num_input_tokens_seen": 35446856, "step": 61445 }, { "epoch": 9.152517128388443, "grad_norm": 0.16622696816921234, "learning_rate": 1.0863470229641403e-06, "loss": 0.8019, "num_input_tokens_seen": 35449800, "step": 61450 }, { "epoch": 9.153261840929401, "grad_norm": 0.2505236864089966, "learning_rate": 1.0844528913908414e-06, "loss": 0.792, "num_input_tokens_seen": 35452712, "step": 61455 }, { "epoch": 9.15400655347036, "grad_norm": 0.2709740698337555, "learning_rate": 1.0825603759329866e-06, "loss": 0.8062, "num_input_tokens_seen": 35455624, "step": 61460 }, { "epoch": 9.154751266011319, "grad_norm": 0.2600485384464264, "learning_rate": 1.0806694767184545e-06, "loss": 0.7737, "num_input_tokens_seen": 35458760, "step": 61465 }, { "epoch": 9.15549597855228, "grad_norm": 0.31956541538238525, "learning_rate": 1.0787801938750314e-06, "loss": 0.7999, "num_input_tokens_seen": 35461576, "step": 61470 }, { "epoch": 9.156240691093238, "grad_norm": 0.2289946973323822, "learning_rate": 1.0768925275303903e-06, "loss": 0.8096, "num_input_tokens_seen": 35464552, "step": 61475 }, { "epoch": 9.156985403634197, "grad_norm": 0.26478826999664307, "learning_rate": 1.0750064778120822e-06, "loss": 0.8038, "num_input_tokens_seen": 35467560, "step": 61480 }, { "epoch": 9.157730116175156, "grad_norm": 0.3177647590637207, "learning_rate": 1.073122044847566e-06, "loss": 0.8285, "num_input_tokens_seen": 35470632, "step": 61485 }, { "epoch": 9.158474828716116, "grad_norm": 0.3588844835758209, "learning_rate": 1.0712392287641842e-06, "loss": 0.7836, "num_input_tokens_seen": 35473640, "step": 61490 }, { "epoch": 9.159219541257075, "grad_norm": 0.22813260555267334, "learning_rate": 1.0693580296891686e-06, "loss": 0.7832, "num_input_tokens_seen": 35476520, "step": 61495 }, { "epoch": 9.159964253798034, "grad_norm": 0.2086167335510254, "learning_rate": 1.0674784477496396e-06, "loss": 0.8122, "num_input_tokens_seen": 35479144, "step": 61500 }, { "epoch": 9.160708966338992, "grad_norm": 0.17623332142829895, "learning_rate": 1.0656004830726153e-06, "loss": 0.7877, "num_input_tokens_seen": 35481864, "step": 61505 }, { "epoch": 9.161453678879953, "grad_norm": 0.32723239064216614, "learning_rate": 1.0637241357849993e-06, "loss": 0.8205, "num_input_tokens_seen": 35484936, "step": 61510 }, { "epoch": 9.162198391420912, "grad_norm": 0.26385411620140076, "learning_rate": 1.061849406013593e-06, "loss": 0.7862, "num_input_tokens_seen": 35487880, "step": 61515 }, { "epoch": 9.16294310396187, "grad_norm": 0.2740108072757721, "learning_rate": 1.059976293885076e-06, "loss": 0.8125, "num_input_tokens_seen": 35490824, "step": 61520 }, { "epoch": 9.16368781650283, "grad_norm": 0.3493916392326355, "learning_rate": 1.0581047995260246e-06, "loss": 0.821, "num_input_tokens_seen": 35493704, "step": 61525 }, { "epoch": 9.16443252904379, "grad_norm": 0.1931537687778473, "learning_rate": 1.0562349230629154e-06, "loss": 0.8355, "num_input_tokens_seen": 35496712, "step": 61530 }, { "epoch": 9.165177241584749, "grad_norm": 0.21709202229976654, "learning_rate": 1.0543666646221002e-06, "loss": 0.8201, "num_input_tokens_seen": 35499464, "step": 61535 }, { "epoch": 9.165921954125707, "grad_norm": 0.15816493332386017, "learning_rate": 1.0525000243298278e-06, "loss": 0.8261, "num_input_tokens_seen": 35502216, "step": 61540 }, { "epoch": 9.166666666666666, "grad_norm": 0.21750758588314056, "learning_rate": 1.050635002312239e-06, "loss": 0.7954, "num_input_tokens_seen": 35504936, "step": 61545 }, { "epoch": 9.167411379207627, "grad_norm": 0.2709953486919403, "learning_rate": 1.0487715986953695e-06, "loss": 0.8109, "num_input_tokens_seen": 35507688, "step": 61550 }, { "epoch": 9.168156091748585, "grad_norm": 0.20842885971069336, "learning_rate": 1.0469098136051375e-06, "loss": 0.7758, "num_input_tokens_seen": 35510536, "step": 61555 }, { "epoch": 9.168900804289544, "grad_norm": 0.25231003761291504, "learning_rate": 1.045049647167351e-06, "loss": 0.808, "num_input_tokens_seen": 35513352, "step": 61560 }, { "epoch": 9.169645516830503, "grad_norm": 0.23062118887901306, "learning_rate": 1.0431910995077205e-06, "loss": 0.7771, "num_input_tokens_seen": 35516392, "step": 61565 }, { "epoch": 9.170390229371463, "grad_norm": 0.24508842825889587, "learning_rate": 1.0413341707518287e-06, "loss": 0.8388, "num_input_tokens_seen": 35519240, "step": 61570 }, { "epoch": 9.171134941912422, "grad_norm": 0.26725682616233826, "learning_rate": 1.039478861025167e-06, "loss": 0.8147, "num_input_tokens_seen": 35522184, "step": 61575 }, { "epoch": 9.171879654453381, "grad_norm": 0.21775585412979126, "learning_rate": 1.0376251704531049e-06, "loss": 0.8112, "num_input_tokens_seen": 35524968, "step": 61580 }, { "epoch": 9.17262436699434, "grad_norm": 0.2608853876590729, "learning_rate": 1.035773099160911e-06, "loss": 0.787, "num_input_tokens_seen": 35528136, "step": 61585 }, { "epoch": 9.1733690795353, "grad_norm": 0.18283814191818237, "learning_rate": 1.033922647273744e-06, "loss": 0.8173, "num_input_tokens_seen": 35530856, "step": 61590 }, { "epoch": 9.174113792076259, "grad_norm": 0.15116330981254578, "learning_rate": 1.0320738149166397e-06, "loss": 0.7868, "num_input_tokens_seen": 35533640, "step": 61595 }, { "epoch": 9.174858504617218, "grad_norm": 0.22620689868927002, "learning_rate": 1.0302266022145457e-06, "loss": 0.8043, "num_input_tokens_seen": 35536232, "step": 61600 }, { "epoch": 9.175603217158177, "grad_norm": 0.37799209356307983, "learning_rate": 1.0283810092922812e-06, "loss": 0.7981, "num_input_tokens_seen": 35538696, "step": 61605 }, { "epoch": 9.176347929699135, "grad_norm": 0.25818589329719543, "learning_rate": 1.0265370362745663e-06, "loss": 0.7872, "num_input_tokens_seen": 35541608, "step": 61610 }, { "epoch": 9.177092642240096, "grad_norm": 0.27192601561546326, "learning_rate": 1.0246946832860093e-06, "loss": 0.7851, "num_input_tokens_seen": 35544840, "step": 61615 }, { "epoch": 9.177837354781055, "grad_norm": 0.18763072788715363, "learning_rate": 1.0228539504511082e-06, "loss": 0.7924, "num_input_tokens_seen": 35547752, "step": 61620 }, { "epoch": 9.178582067322013, "grad_norm": 0.2510385513305664, "learning_rate": 1.0210148378942573e-06, "loss": 0.8023, "num_input_tokens_seen": 35550600, "step": 61625 }, { "epoch": 9.179326779862972, "grad_norm": 0.19120042026042938, "learning_rate": 1.0191773457397274e-06, "loss": 0.787, "num_input_tokens_seen": 35553800, "step": 61630 }, { "epoch": 9.180071492403933, "grad_norm": 0.16551847755908966, "learning_rate": 1.0173414741116994e-06, "loss": 0.7909, "num_input_tokens_seen": 35556776, "step": 61635 }, { "epoch": 9.180816204944891, "grad_norm": 0.20970575511455536, "learning_rate": 1.015507223134224e-06, "loss": 0.7781, "num_input_tokens_seen": 35559496, "step": 61640 }, { "epoch": 9.18156091748585, "grad_norm": 0.18666326999664307, "learning_rate": 1.0136745929312546e-06, "loss": 0.7947, "num_input_tokens_seen": 35562408, "step": 61645 }, { "epoch": 9.182305630026809, "grad_norm": 0.3413722515106201, "learning_rate": 1.011843583626637e-06, "loss": 0.8492, "num_input_tokens_seen": 35565416, "step": 61650 }, { "epoch": 9.18305034256777, "grad_norm": 0.16333885490894318, "learning_rate": 1.010014195344103e-06, "loss": 0.7902, "num_input_tokens_seen": 35568136, "step": 61655 }, { "epoch": 9.183795055108728, "grad_norm": 0.23509752750396729, "learning_rate": 1.0081864282072722e-06, "loss": 0.7787, "num_input_tokens_seen": 35571208, "step": 61660 }, { "epoch": 9.184539767649687, "grad_norm": 0.19641584157943726, "learning_rate": 1.0063602823396578e-06, "loss": 0.8064, "num_input_tokens_seen": 35574152, "step": 61665 }, { "epoch": 9.185284480190646, "grad_norm": 0.24341954290866852, "learning_rate": 1.0045357578646664e-06, "loss": 0.7969, "num_input_tokens_seen": 35576808, "step": 61670 }, { "epoch": 9.186029192731606, "grad_norm": 0.2551862895488739, "learning_rate": 1.0027128549055881e-06, "loss": 0.7933, "num_input_tokens_seen": 35580168, "step": 61675 }, { "epoch": 9.186773905272565, "grad_norm": 0.19386807084083557, "learning_rate": 1.0008915735856134e-06, "loss": 0.8064, "num_input_tokens_seen": 35583272, "step": 61680 }, { "epoch": 9.187518617813524, "grad_norm": 0.3455545902252197, "learning_rate": 9.990719140278077e-07, "loss": 0.8031, "num_input_tokens_seen": 35586120, "step": 61685 }, { "epoch": 9.188263330354483, "grad_norm": 0.33284372091293335, "learning_rate": 9.972538763551448e-07, "loss": 0.7844, "num_input_tokens_seen": 35589160, "step": 61690 }, { "epoch": 9.189008042895443, "grad_norm": 0.23992229998111725, "learning_rate": 9.954374606904765e-07, "loss": 0.7931, "num_input_tokens_seen": 35591912, "step": 61695 }, { "epoch": 9.189752755436402, "grad_norm": 0.2631608843803406, "learning_rate": 9.936226671565491e-07, "loss": 0.7972, "num_input_tokens_seen": 35594664, "step": 61700 }, { "epoch": 9.19049746797736, "grad_norm": 0.1833207607269287, "learning_rate": 9.91809495875995e-07, "loss": 0.816, "num_input_tokens_seen": 35597576, "step": 61705 }, { "epoch": 9.19124218051832, "grad_norm": 0.27769288420677185, "learning_rate": 9.899979469713494e-07, "loss": 0.8155, "num_input_tokens_seen": 35600776, "step": 61710 }, { "epoch": 9.19198689305928, "grad_norm": 0.2845047116279602, "learning_rate": 9.88188020565023e-07, "loss": 0.8081, "num_input_tokens_seen": 35603656, "step": 61715 }, { "epoch": 9.192731605600239, "grad_norm": 0.25415703654289246, "learning_rate": 9.863797167793286e-07, "loss": 0.8029, "num_input_tokens_seen": 35606280, "step": 61720 }, { "epoch": 9.193476318141197, "grad_norm": 0.21643558144569397, "learning_rate": 9.84573035736455e-07, "loss": 0.805, "num_input_tokens_seen": 35609064, "step": 61725 }, { "epoch": 9.194221030682156, "grad_norm": 0.28641796112060547, "learning_rate": 9.827679775585019e-07, "loss": 0.774, "num_input_tokens_seen": 35612168, "step": 61730 }, { "epoch": 9.194965743223117, "grad_norm": 0.24042989313602448, "learning_rate": 9.80964542367438e-07, "loss": 0.7964, "num_input_tokens_seen": 35614984, "step": 61735 }, { "epoch": 9.195710455764075, "grad_norm": 0.21375635266304016, "learning_rate": 9.79162730285138e-07, "loss": 0.8278, "num_input_tokens_seen": 35617640, "step": 61740 }, { "epoch": 9.196455168305034, "grad_norm": 0.2586604058742523, "learning_rate": 9.773625414333576e-07, "loss": 0.772, "num_input_tokens_seen": 35620616, "step": 61745 }, { "epoch": 9.197199880845993, "grad_norm": 0.27872318029403687, "learning_rate": 9.755639759337466e-07, "loss": 0.7842, "num_input_tokens_seen": 35623336, "step": 61750 }, { "epoch": 9.197944593386953, "grad_norm": 0.26399797201156616, "learning_rate": 9.737670339078491e-07, "loss": 0.784, "num_input_tokens_seen": 35626248, "step": 61755 }, { "epoch": 9.198689305927912, "grad_norm": 0.23180201649665833, "learning_rate": 9.719717154770908e-07, "loss": 0.7978, "num_input_tokens_seen": 35629032, "step": 61760 }, { "epoch": 9.199434018468871, "grad_norm": 0.33926159143447876, "learning_rate": 9.701780207627963e-07, "loss": 0.8288, "num_input_tokens_seen": 35632104, "step": 61765 }, { "epoch": 9.20017873100983, "grad_norm": 0.2023073434829712, "learning_rate": 9.683859498861691e-07, "loss": 0.7919, "num_input_tokens_seen": 35635080, "step": 61770 }, { "epoch": 9.200923443550789, "grad_norm": 0.16607259213924408, "learning_rate": 9.665955029683122e-07, "loss": 0.8018, "num_input_tokens_seen": 35637960, "step": 61775 }, { "epoch": 9.201668156091749, "grad_norm": 0.19757230579853058, "learning_rate": 9.648066801302202e-07, "loss": 0.7905, "num_input_tokens_seen": 35641096, "step": 61780 }, { "epoch": 9.202412868632708, "grad_norm": 0.2302677482366562, "learning_rate": 9.630194814927718e-07, "loss": 0.7898, "num_input_tokens_seen": 35644136, "step": 61785 }, { "epoch": 9.203157581173667, "grad_norm": 0.2234152853488922, "learning_rate": 9.612339071767451e-07, "loss": 0.8188, "num_input_tokens_seen": 35647176, "step": 61790 }, { "epoch": 9.203902293714625, "grad_norm": 0.2099030762910843, "learning_rate": 9.59449957302791e-07, "loss": 0.7871, "num_input_tokens_seen": 35649928, "step": 61795 }, { "epoch": 9.204647006255586, "grad_norm": 0.2610621452331543, "learning_rate": 9.576676319914713e-07, "loss": 0.8049, "num_input_tokens_seen": 35652936, "step": 61800 }, { "epoch": 9.205391718796545, "grad_norm": 0.19212651252746582, "learning_rate": 9.558869313632202e-07, "loss": 0.7891, "num_input_tokens_seen": 35655912, "step": 61805 }, { "epoch": 9.206136431337503, "grad_norm": 0.1801275759935379, "learning_rate": 9.541078555383747e-07, "loss": 0.8071, "num_input_tokens_seen": 35658632, "step": 61810 }, { "epoch": 9.206881143878462, "grad_norm": 0.18372192978858948, "learning_rate": 9.523304046371556e-07, "loss": 0.799, "num_input_tokens_seen": 35661384, "step": 61815 }, { "epoch": 9.207625856419423, "grad_norm": 0.22469794750213623, "learning_rate": 9.505545787796777e-07, "loss": 0.7796, "num_input_tokens_seen": 35664200, "step": 61820 }, { "epoch": 9.208370568960381, "grad_norm": 0.1968991905450821, "learning_rate": 9.48780378085945e-07, "loss": 0.7745, "num_input_tokens_seen": 35667048, "step": 61825 }, { "epoch": 9.20911528150134, "grad_norm": 0.2492022067308426, "learning_rate": 9.470078026758477e-07, "loss": 0.8147, "num_input_tokens_seen": 35670088, "step": 61830 }, { "epoch": 9.209859994042299, "grad_norm": 0.20519597828388214, "learning_rate": 9.452368526691735e-07, "loss": 0.7924, "num_input_tokens_seen": 35672744, "step": 61835 }, { "epoch": 9.21060470658326, "grad_norm": 0.20270304381847382, "learning_rate": 9.434675281855932e-07, "loss": 0.7932, "num_input_tokens_seen": 35675592, "step": 61840 }, { "epoch": 9.211349419124218, "grad_norm": 0.15736371278762817, "learning_rate": 9.416998293446666e-07, "loss": 0.7906, "num_input_tokens_seen": 35678088, "step": 61845 }, { "epoch": 9.212094131665177, "grad_norm": 0.22085325419902802, "learning_rate": 9.399337562658539e-07, "loss": 0.818, "num_input_tokens_seen": 35680840, "step": 61850 }, { "epoch": 9.212838844206136, "grad_norm": 0.2446114718914032, "learning_rate": 9.381693090684957e-07, "loss": 0.8034, "num_input_tokens_seen": 35683752, "step": 61855 }, { "epoch": 9.213583556747096, "grad_norm": 0.24834343791007996, "learning_rate": 9.364064878718298e-07, "loss": 0.7635, "num_input_tokens_seen": 35686568, "step": 61860 }, { "epoch": 9.214328269288055, "grad_norm": 0.16017889976501465, "learning_rate": 9.346452927949778e-07, "loss": 0.8178, "num_input_tokens_seen": 35689320, "step": 61865 }, { "epoch": 9.215072981829014, "grad_norm": 0.29053303599357605, "learning_rate": 9.328857239569527e-07, "loss": 0.7934, "num_input_tokens_seen": 35692200, "step": 61870 }, { "epoch": 9.215817694369973, "grad_norm": 0.16304095089435577, "learning_rate": 9.311277814766595e-07, "loss": 0.7862, "num_input_tokens_seen": 35694888, "step": 61875 }, { "epoch": 9.216562406910933, "grad_norm": 0.2941451370716095, "learning_rate": 9.293714654728974e-07, "loss": 0.8369, "num_input_tokens_seen": 35698024, "step": 61880 }, { "epoch": 9.217307119451892, "grad_norm": 0.18237970769405365, "learning_rate": 9.276167760643439e-07, "loss": 0.7936, "num_input_tokens_seen": 35700744, "step": 61885 }, { "epoch": 9.21805183199285, "grad_norm": 0.22838345170021057, "learning_rate": 9.258637133695791e-07, "loss": 0.7686, "num_input_tokens_seen": 35703752, "step": 61890 }, { "epoch": 9.21879654453381, "grad_norm": 0.22589486837387085, "learning_rate": 9.241122775070693e-07, "loss": 0.8093, "num_input_tokens_seen": 35706440, "step": 61895 }, { "epoch": 9.21954125707477, "grad_norm": 0.1649818867444992, "learning_rate": 9.223624685951615e-07, "loss": 0.8048, "num_input_tokens_seen": 35709352, "step": 61900 }, { "epoch": 9.220285969615729, "grad_norm": 0.20496866106987, "learning_rate": 9.206142867521084e-07, "loss": 0.8083, "num_input_tokens_seen": 35712456, "step": 61905 }, { "epoch": 9.221030682156687, "grad_norm": 0.3102746605873108, "learning_rate": 9.188677320960404e-07, "loss": 0.7852, "num_input_tokens_seen": 35715336, "step": 61910 }, { "epoch": 9.221775394697646, "grad_norm": 0.19321097433567047, "learning_rate": 9.171228047449825e-07, "loss": 0.803, "num_input_tokens_seen": 35718152, "step": 61915 }, { "epoch": 9.222520107238607, "grad_norm": 0.23525626957416534, "learning_rate": 9.153795048168573e-07, "loss": 0.7876, "num_input_tokens_seen": 35720808, "step": 61920 }, { "epoch": 9.223264819779565, "grad_norm": 0.22542662918567657, "learning_rate": 9.136378324294592e-07, "loss": 0.8347, "num_input_tokens_seen": 35723592, "step": 61925 }, { "epoch": 9.224009532320524, "grad_norm": 0.20303966104984283, "learning_rate": 9.118977877004942e-07, "loss": 0.7749, "num_input_tokens_seen": 35726472, "step": 61930 }, { "epoch": 9.224754244861483, "grad_norm": 0.24756725132465363, "learning_rate": 9.101593707475376e-07, "loss": 0.7925, "num_input_tokens_seen": 35729192, "step": 61935 }, { "epoch": 9.225498957402444, "grad_norm": 0.3001101016998291, "learning_rate": 9.084225816880677e-07, "loss": 0.7891, "num_input_tokens_seen": 35732264, "step": 61940 }, { "epoch": 9.226243669943402, "grad_norm": 0.2139638513326645, "learning_rate": 9.066874206394488e-07, "loss": 0.8059, "num_input_tokens_seen": 35735112, "step": 61945 }, { "epoch": 9.226988382484361, "grad_norm": 0.2871970236301422, "learning_rate": 9.049538877189401e-07, "loss": 0.789, "num_input_tokens_seen": 35738024, "step": 61950 }, { "epoch": 9.22773309502532, "grad_norm": 0.209382101893425, "learning_rate": 9.032219830436867e-07, "loss": 0.787, "num_input_tokens_seen": 35740840, "step": 61955 }, { "epoch": 9.228477807566279, "grad_norm": 0.26831528544425964, "learning_rate": 9.014917067307227e-07, "loss": 0.8247, "num_input_tokens_seen": 35743752, "step": 61960 }, { "epoch": 9.229222520107239, "grad_norm": 0.2567800283432007, "learning_rate": 8.997630588969686e-07, "loss": 0.7835, "num_input_tokens_seen": 35746856, "step": 61965 }, { "epoch": 9.229967232648198, "grad_norm": 0.21559785306453705, "learning_rate": 8.980360396592419e-07, "loss": 0.8014, "num_input_tokens_seen": 35750184, "step": 61970 }, { "epoch": 9.230711945189157, "grad_norm": 0.2747771143913269, "learning_rate": 8.963106491342466e-07, "loss": 0.81, "num_input_tokens_seen": 35752936, "step": 61975 }, { "epoch": 9.231456657730115, "grad_norm": 0.14590886235237122, "learning_rate": 8.94586887438581e-07, "loss": 0.8068, "num_input_tokens_seen": 35755720, "step": 61980 }, { "epoch": 9.232201370271076, "grad_norm": 0.2812914252281189, "learning_rate": 8.928647546887269e-07, "loss": 0.7633, "num_input_tokens_seen": 35758632, "step": 61985 }, { "epoch": 9.232946082812035, "grad_norm": 0.18972253799438477, "learning_rate": 8.911442510010637e-07, "loss": 0.8212, "num_input_tokens_seen": 35761288, "step": 61990 }, { "epoch": 9.233690795352993, "grad_norm": 0.2130403220653534, "learning_rate": 8.894253764918509e-07, "loss": 0.7983, "num_input_tokens_seen": 35764040, "step": 61995 }, { "epoch": 9.234435507893952, "grad_norm": 0.2975718080997467, "learning_rate": 8.877081312772456e-07, "loss": 0.8068, "num_input_tokens_seen": 35766920, "step": 62000 }, { "epoch": 9.235180220434913, "grad_norm": 0.20394514501094818, "learning_rate": 8.859925154732885e-07, "loss": 0.7729, "num_input_tokens_seen": 35770024, "step": 62005 }, { "epoch": 9.235924932975871, "grad_norm": 0.1858719438314438, "learning_rate": 8.842785291959199e-07, "loss": 0.7938, "num_input_tokens_seen": 35772776, "step": 62010 }, { "epoch": 9.23666964551683, "grad_norm": 0.19964127242565155, "learning_rate": 8.825661725609585e-07, "loss": 0.8269, "num_input_tokens_seen": 35775528, "step": 62015 }, { "epoch": 9.237414358057789, "grad_norm": 0.27854612469673157, "learning_rate": 8.808554456841201e-07, "loss": 0.814, "num_input_tokens_seen": 35778632, "step": 62020 }, { "epoch": 9.23815907059875, "grad_norm": 0.19582483172416687, "learning_rate": 8.79146348681012e-07, "loss": 0.7911, "num_input_tokens_seen": 35781416, "step": 62025 }, { "epoch": 9.238903783139708, "grad_norm": 0.24629628658294678, "learning_rate": 8.774388816671253e-07, "loss": 0.7908, "num_input_tokens_seen": 35784136, "step": 62030 }, { "epoch": 9.239648495680667, "grad_norm": 0.296822190284729, "learning_rate": 8.757330447578399e-07, "loss": 0.8309, "num_input_tokens_seen": 35787112, "step": 62035 }, { "epoch": 9.240393208221626, "grad_norm": 0.20063526928424835, "learning_rate": 8.740288380684386e-07, "loss": 0.8098, "num_input_tokens_seen": 35790312, "step": 62040 }, { "epoch": 9.241137920762586, "grad_norm": 0.1988063007593155, "learning_rate": 8.723262617140765e-07, "loss": 0.8207, "num_input_tokens_seen": 35793032, "step": 62045 }, { "epoch": 9.241882633303545, "grad_norm": 0.22054293751716614, "learning_rate": 8.706253158098088e-07, "loss": 0.8237, "num_input_tokens_seen": 35795688, "step": 62050 }, { "epoch": 9.242627345844504, "grad_norm": 0.21678653359413147, "learning_rate": 8.689260004705823e-07, "loss": 0.7994, "num_input_tokens_seen": 35798888, "step": 62055 }, { "epoch": 9.243372058385463, "grad_norm": 0.1553701013326645, "learning_rate": 8.672283158112249e-07, "loss": 0.801, "num_input_tokens_seen": 35801544, "step": 62060 }, { "epoch": 9.244116770926423, "grad_norm": 0.18303902447223663, "learning_rate": 8.655322619464612e-07, "loss": 0.7853, "num_input_tokens_seen": 35804424, "step": 62065 }, { "epoch": 9.244861483467382, "grad_norm": 0.28926384449005127, "learning_rate": 8.638378389909052e-07, "loss": 0.797, "num_input_tokens_seen": 35807752, "step": 62070 }, { "epoch": 9.24560619600834, "grad_norm": 0.19485153257846832, "learning_rate": 8.621450470590542e-07, "loss": 0.7905, "num_input_tokens_seen": 35810632, "step": 62075 }, { "epoch": 9.2463509085493, "grad_norm": 0.27891212701797485, "learning_rate": 8.604538862653084e-07, "loss": 0.7931, "num_input_tokens_seen": 35813384, "step": 62080 }, { "epoch": 9.24709562109026, "grad_norm": 0.2187211960554123, "learning_rate": 8.5876435672394e-07, "loss": 0.7982, "num_input_tokens_seen": 35816168, "step": 62085 }, { "epoch": 9.247840333631219, "grad_norm": 0.18437588214874268, "learning_rate": 8.570764585491275e-07, "loss": 0.7911, "num_input_tokens_seen": 35819112, "step": 62090 }, { "epoch": 9.248585046172177, "grad_norm": 0.22528022527694702, "learning_rate": 8.553901918549323e-07, "loss": 0.8202, "num_input_tokens_seen": 35821992, "step": 62095 }, { "epoch": 9.249329758713136, "grad_norm": 0.19898930191993713, "learning_rate": 8.537055567552993e-07, "loss": 0.7618, "num_input_tokens_seen": 35824904, "step": 62100 }, { "epoch": 9.250074471254097, "grad_norm": 0.16808153688907623, "learning_rate": 8.520225533640735e-07, "loss": 0.8034, "num_input_tokens_seen": 35827784, "step": 62105 }, { "epoch": 9.250819183795056, "grad_norm": 0.22923782467842102, "learning_rate": 8.503411817949863e-07, "loss": 0.7988, "num_input_tokens_seen": 35830664, "step": 62110 }, { "epoch": 9.251563896336014, "grad_norm": 0.24408796429634094, "learning_rate": 8.486614421616551e-07, "loss": 0.7914, "num_input_tokens_seen": 35833544, "step": 62115 }, { "epoch": 9.252308608876973, "grad_norm": 0.18849053978919983, "learning_rate": 8.469833345775946e-07, "loss": 0.8053, "num_input_tokens_seen": 35836264, "step": 62120 }, { "epoch": 9.253053321417934, "grad_norm": 0.1746872067451477, "learning_rate": 8.453068591562003e-07, "loss": 0.8081, "num_input_tokens_seen": 35839080, "step": 62125 }, { "epoch": 9.253798033958892, "grad_norm": 0.19594833254814148, "learning_rate": 8.436320160107619e-07, "loss": 0.8046, "num_input_tokens_seen": 35841832, "step": 62130 }, { "epoch": 9.254542746499851, "grad_norm": 0.20253995060920715, "learning_rate": 8.419588052544586e-07, "loss": 0.8056, "num_input_tokens_seen": 35844520, "step": 62135 }, { "epoch": 9.25528745904081, "grad_norm": 0.18998874723911285, "learning_rate": 8.402872270003582e-07, "loss": 0.8071, "num_input_tokens_seen": 35847400, "step": 62140 }, { "epoch": 9.256032171581769, "grad_norm": 0.24199612438678741, "learning_rate": 8.386172813614229e-07, "loss": 0.7978, "num_input_tokens_seen": 35850184, "step": 62145 }, { "epoch": 9.25677688412273, "grad_norm": 0.25541749596595764, "learning_rate": 8.369489684504961e-07, "loss": 0.7756, "num_input_tokens_seen": 35853256, "step": 62150 }, { "epoch": 9.257521596663688, "grad_norm": 0.263380229473114, "learning_rate": 8.352822883803235e-07, "loss": 0.8397, "num_input_tokens_seen": 35856104, "step": 62155 }, { "epoch": 9.258266309204647, "grad_norm": 0.19500099122524261, "learning_rate": 8.336172412635263e-07, "loss": 0.8215, "num_input_tokens_seen": 35858664, "step": 62160 }, { "epoch": 9.259011021745605, "grad_norm": 0.2201622724533081, "learning_rate": 8.319538272126198e-07, "loss": 0.8237, "num_input_tokens_seen": 35861512, "step": 62165 }, { "epoch": 9.259755734286566, "grad_norm": 0.1881481260061264, "learning_rate": 8.302920463400143e-07, "loss": 0.795, "num_input_tokens_seen": 35864328, "step": 62170 }, { "epoch": 9.260500446827525, "grad_norm": 0.2845439910888672, "learning_rate": 8.286318987580061e-07, "loss": 0.8178, "num_input_tokens_seen": 35867144, "step": 62175 }, { "epoch": 9.261245159368483, "grad_norm": 0.24504218995571136, "learning_rate": 8.269733845787775e-07, "loss": 0.7954, "num_input_tokens_seen": 35870056, "step": 62180 }, { "epoch": 9.261989871909442, "grad_norm": 0.21251262724399567, "learning_rate": 8.253165039144111e-07, "loss": 0.8196, "num_input_tokens_seen": 35873000, "step": 62185 }, { "epoch": 9.262734584450403, "grad_norm": 0.2781949043273926, "learning_rate": 8.236612568768676e-07, "loss": 0.8266, "num_input_tokens_seen": 35875976, "step": 62190 }, { "epoch": 9.263479296991362, "grad_norm": 0.19857648015022278, "learning_rate": 8.220076435780016e-07, "loss": 0.789, "num_input_tokens_seen": 35878696, "step": 62195 }, { "epoch": 9.26422400953232, "grad_norm": 0.22353029251098633, "learning_rate": 8.203556641295601e-07, "loss": 0.7801, "num_input_tokens_seen": 35881800, "step": 62200 }, { "epoch": 9.264968722073279, "grad_norm": 0.3312164843082428, "learning_rate": 8.187053186431731e-07, "loss": 0.804, "num_input_tokens_seen": 35885160, "step": 62205 }, { "epoch": 9.26571343461424, "grad_norm": 0.2132532000541687, "learning_rate": 8.170566072303681e-07, "loss": 0.7935, "num_input_tokens_seen": 35888008, "step": 62210 }, { "epoch": 9.266458147155198, "grad_norm": 0.2658476233482361, "learning_rate": 8.15409530002556e-07, "loss": 0.7964, "num_input_tokens_seen": 35891112, "step": 62215 }, { "epoch": 9.267202859696157, "grad_norm": 0.2647756338119507, "learning_rate": 8.137640870710395e-07, "loss": 0.7863, "num_input_tokens_seen": 35893768, "step": 62220 }, { "epoch": 9.267947572237116, "grad_norm": 0.3840947151184082, "learning_rate": 8.121202785470156e-07, "loss": 0.8076, "num_input_tokens_seen": 35896616, "step": 62225 }, { "epoch": 9.268692284778076, "grad_norm": 0.25843581557273865, "learning_rate": 8.104781045415594e-07, "loss": 0.7834, "num_input_tokens_seen": 35899400, "step": 62230 }, { "epoch": 9.269436997319035, "grad_norm": 0.2264758050441742, "learning_rate": 8.08837565165646e-07, "loss": 0.7859, "num_input_tokens_seen": 35902312, "step": 62235 }, { "epoch": 9.270181709859994, "grad_norm": 0.24178630113601685, "learning_rate": 8.071986605301396e-07, "loss": 0.816, "num_input_tokens_seen": 35905448, "step": 62240 }, { "epoch": 9.270926422400953, "grad_norm": 0.21078933775424957, "learning_rate": 8.055613907457821e-07, "loss": 0.7956, "num_input_tokens_seen": 35908680, "step": 62245 }, { "epoch": 9.271671134941913, "grad_norm": 0.38941970467567444, "learning_rate": 8.039257559232182e-07, "loss": 0.8195, "num_input_tokens_seen": 35911688, "step": 62250 }, { "epoch": 9.272415847482872, "grad_norm": 0.23026764392852783, "learning_rate": 8.022917561729793e-07, "loss": 0.7933, "num_input_tokens_seen": 35914536, "step": 62255 }, { "epoch": 9.27316056002383, "grad_norm": 0.17446930706501007, "learning_rate": 8.00659391605485e-07, "loss": 0.8187, "num_input_tokens_seen": 35917576, "step": 62260 }, { "epoch": 9.27390527256479, "grad_norm": 0.17300385236740112, "learning_rate": 7.990286623310389e-07, "loss": 0.8128, "num_input_tokens_seen": 35920296, "step": 62265 }, { "epoch": 9.27464998510575, "grad_norm": 0.2072194367647171, "learning_rate": 7.973995684598418e-07, "loss": 0.7778, "num_input_tokens_seen": 35923112, "step": 62270 }, { "epoch": 9.275394697646709, "grad_norm": 0.2874036431312561, "learning_rate": 7.957721101019805e-07, "loss": 0.8013, "num_input_tokens_seen": 35926248, "step": 62275 }, { "epoch": 9.276139410187668, "grad_norm": 0.2125653773546219, "learning_rate": 7.941462873674338e-07, "loss": 0.8078, "num_input_tokens_seen": 35929256, "step": 62280 }, { "epoch": 9.276884122728626, "grad_norm": 0.2682957053184509, "learning_rate": 7.925221003660694e-07, "loss": 0.7788, "num_input_tokens_seen": 35931784, "step": 62285 }, { "epoch": 9.277628835269585, "grad_norm": 0.23134973645210266, "learning_rate": 7.90899549207641e-07, "loss": 0.7632, "num_input_tokens_seen": 35934696, "step": 62290 }, { "epoch": 9.278373547810546, "grad_norm": 0.244641974568367, "learning_rate": 7.892786340017916e-07, "loss": 0.8111, "num_input_tokens_seen": 35937512, "step": 62295 }, { "epoch": 9.279118260351504, "grad_norm": 0.20429515838623047, "learning_rate": 7.876593548580585e-07, "loss": 0.7869, "num_input_tokens_seen": 35940392, "step": 62300 }, { "epoch": 9.279862972892463, "grad_norm": 0.19740088284015656, "learning_rate": 7.860417118858654e-07, "loss": 0.7747, "num_input_tokens_seen": 35943240, "step": 62305 }, { "epoch": 9.280607685433422, "grad_norm": 0.31428441405296326, "learning_rate": 7.844257051945275e-07, "loss": 0.7849, "num_input_tokens_seen": 35946280, "step": 62310 }, { "epoch": 9.281352397974382, "grad_norm": 0.22641929984092712, "learning_rate": 7.828113348932464e-07, "loss": 0.8156, "num_input_tokens_seen": 35948936, "step": 62315 }, { "epoch": 9.282097110515341, "grad_norm": 0.19993247091770172, "learning_rate": 7.811986010911182e-07, "loss": 0.8122, "num_input_tokens_seen": 35952392, "step": 62320 }, { "epoch": 9.2828418230563, "grad_norm": 0.16382601857185364, "learning_rate": 7.795875038971223e-07, "loss": 0.8025, "num_input_tokens_seen": 35955112, "step": 62325 }, { "epoch": 9.283586535597259, "grad_norm": 0.1524239480495453, "learning_rate": 7.779780434201273e-07, "loss": 0.7958, "num_input_tokens_seen": 35957928, "step": 62330 }, { "epoch": 9.28433124813822, "grad_norm": 0.24539171159267426, "learning_rate": 7.76370219768896e-07, "loss": 0.8123, "num_input_tokens_seen": 35961032, "step": 62335 }, { "epoch": 9.285075960679178, "grad_norm": 0.20560823380947113, "learning_rate": 7.747640330520805e-07, "loss": 0.7864, "num_input_tokens_seen": 35964040, "step": 62340 }, { "epoch": 9.285820673220137, "grad_norm": 0.14228267967700958, "learning_rate": 7.731594833782191e-07, "loss": 0.7956, "num_input_tokens_seen": 35966696, "step": 62345 }, { "epoch": 9.286565385761095, "grad_norm": 0.18538856506347656, "learning_rate": 7.715565708557387e-07, "loss": 0.7819, "num_input_tokens_seen": 35969480, "step": 62350 }, { "epoch": 9.287310098302056, "grad_norm": 0.2939034700393677, "learning_rate": 7.69955295592964e-07, "loss": 0.8018, "num_input_tokens_seen": 35972296, "step": 62355 }, { "epoch": 9.288054810843015, "grad_norm": 0.21294766664505005, "learning_rate": 7.683556576980944e-07, "loss": 0.7754, "num_input_tokens_seen": 35975144, "step": 62360 }, { "epoch": 9.288799523383974, "grad_norm": 0.18667908012866974, "learning_rate": 7.667576572792323e-07, "loss": 0.7989, "num_input_tokens_seen": 35977896, "step": 62365 }, { "epoch": 9.289544235924932, "grad_norm": 0.23406068980693817, "learning_rate": 7.651612944443609e-07, "loss": 0.823, "num_input_tokens_seen": 35980712, "step": 62370 }, { "epoch": 9.290288948465893, "grad_norm": 0.1685478687286377, "learning_rate": 7.635665693013577e-07, "loss": 0.7914, "num_input_tokens_seen": 35983496, "step": 62375 }, { "epoch": 9.291033661006852, "grad_norm": 0.18356989324092865, "learning_rate": 7.619734819579893e-07, "loss": 0.7705, "num_input_tokens_seen": 35986024, "step": 62380 }, { "epoch": 9.29177837354781, "grad_norm": 0.22399446368217468, "learning_rate": 7.603820325219058e-07, "loss": 0.8266, "num_input_tokens_seen": 35988808, "step": 62385 }, { "epoch": 9.292523086088769, "grad_norm": 0.3893227279186249, "learning_rate": 7.58792221100657e-07, "loss": 0.7975, "num_input_tokens_seen": 35991720, "step": 62390 }, { "epoch": 9.29326779862973, "grad_norm": 0.2786152958869934, "learning_rate": 7.572040478016712e-07, "loss": 0.8117, "num_input_tokens_seen": 35994440, "step": 62395 }, { "epoch": 9.294012511170688, "grad_norm": 0.21772032976150513, "learning_rate": 7.556175127322707e-07, "loss": 0.7938, "num_input_tokens_seen": 35997096, "step": 62400 }, { "epoch": 9.294757223711647, "grad_norm": 0.20930267870426178, "learning_rate": 7.540326159996697e-07, "loss": 0.8079, "num_input_tokens_seen": 36000168, "step": 62405 }, { "epoch": 9.295501936252606, "grad_norm": 0.2229377180337906, "learning_rate": 7.524493577109659e-07, "loss": 0.8059, "num_input_tokens_seen": 36003080, "step": 62410 }, { "epoch": 9.296246648793566, "grad_norm": 0.2424517124891281, "learning_rate": 7.508677379731515e-07, "loss": 0.7889, "num_input_tokens_seen": 36005896, "step": 62415 }, { "epoch": 9.296991361334525, "grad_norm": 0.200967937707901, "learning_rate": 7.49287756893105e-07, "loss": 0.832, "num_input_tokens_seen": 36008744, "step": 62420 }, { "epoch": 9.297736073875484, "grad_norm": 0.2768764793872833, "learning_rate": 7.477094145775993e-07, "loss": 0.7926, "num_input_tokens_seen": 36011912, "step": 62425 }, { "epoch": 9.298480786416443, "grad_norm": 0.17925438284873962, "learning_rate": 7.46132711133285e-07, "loss": 0.7877, "num_input_tokens_seen": 36014920, "step": 62430 }, { "epoch": 9.299225498957403, "grad_norm": 0.7302107214927673, "learning_rate": 7.445576466667131e-07, "loss": 0.8021, "num_input_tokens_seen": 36017992, "step": 62435 }, { "epoch": 9.299970211498362, "grad_norm": 0.227450430393219, "learning_rate": 7.429842212843208e-07, "loss": 0.792, "num_input_tokens_seen": 36020744, "step": 62440 }, { "epoch": 9.30071492403932, "grad_norm": 0.2777613699436188, "learning_rate": 7.41412435092434e-07, "loss": 0.7772, "num_input_tokens_seen": 36023304, "step": 62445 }, { "epoch": 9.30145963658028, "grad_norm": 0.21496614813804626, "learning_rate": 7.39842288197265e-07, "loss": 0.785, "num_input_tokens_seen": 36026056, "step": 62450 }, { "epoch": 9.30220434912124, "grad_norm": 0.18663443624973297, "learning_rate": 7.382737807049233e-07, "loss": 0.7844, "num_input_tokens_seen": 36028680, "step": 62455 }, { "epoch": 9.302949061662199, "grad_norm": 0.19891351461410522, "learning_rate": 7.367069127213938e-07, "loss": 0.7913, "num_input_tokens_seen": 36031560, "step": 62460 }, { "epoch": 9.303693774203158, "grad_norm": 0.21164144575595856, "learning_rate": 7.351416843525638e-07, "loss": 0.7884, "num_input_tokens_seen": 36034440, "step": 62465 }, { "epoch": 9.304438486744116, "grad_norm": 0.23271827399730682, "learning_rate": 7.335780957042071e-07, "loss": 0.8046, "num_input_tokens_seen": 36037288, "step": 62470 }, { "epoch": 9.305183199285075, "grad_norm": 0.17257502675056458, "learning_rate": 7.320161468819808e-07, "loss": 0.7778, "num_input_tokens_seen": 36040264, "step": 62475 }, { "epoch": 9.305927911826036, "grad_norm": 0.2469954937696457, "learning_rate": 7.304558379914395e-07, "loss": 0.8186, "num_input_tokens_seen": 36043016, "step": 62480 }, { "epoch": 9.306672624366994, "grad_norm": 0.21269308030605316, "learning_rate": 7.288971691380209e-07, "loss": 0.8052, "num_input_tokens_seen": 36045800, "step": 62485 }, { "epoch": 9.307417336907953, "grad_norm": 0.26142293214797974, "learning_rate": 7.273401404270519e-07, "loss": 0.8207, "num_input_tokens_seen": 36048776, "step": 62490 }, { "epoch": 9.308162049448912, "grad_norm": 0.19224239885807037, "learning_rate": 7.257847519637484e-07, "loss": 0.7968, "num_input_tokens_seen": 36051816, "step": 62495 }, { "epoch": 9.308906761989872, "grad_norm": 0.22256070375442505, "learning_rate": 7.24231003853218e-07, "loss": 0.8088, "num_input_tokens_seen": 36054472, "step": 62500 }, { "epoch": 9.309651474530831, "grad_norm": 0.22287267446517944, "learning_rate": 7.2267889620046e-07, "loss": 0.8365, "num_input_tokens_seen": 36057256, "step": 62505 }, { "epoch": 9.31039618707179, "grad_norm": 0.34072500467300415, "learning_rate": 7.21128429110357e-07, "loss": 0.8184, "num_input_tokens_seen": 36060264, "step": 62510 }, { "epoch": 9.311140899612749, "grad_norm": 0.22879275679588318, "learning_rate": 7.195796026876866e-07, "loss": 0.7892, "num_input_tokens_seen": 36063016, "step": 62515 }, { "epoch": 9.31188561215371, "grad_norm": 0.2035953253507614, "learning_rate": 7.180324170371095e-07, "loss": 0.8004, "num_input_tokens_seen": 36065832, "step": 62520 }, { "epoch": 9.312630324694668, "grad_norm": 0.2026851624250412, "learning_rate": 7.164868722631807e-07, "loss": 0.7985, "num_input_tokens_seen": 36068744, "step": 62525 }, { "epoch": 9.313375037235627, "grad_norm": 0.263405442237854, "learning_rate": 7.149429684703335e-07, "loss": 0.7969, "num_input_tokens_seen": 36071656, "step": 62530 }, { "epoch": 9.314119749776586, "grad_norm": 0.25940388441085815, "learning_rate": 7.134007057629066e-07, "loss": 0.7726, "num_input_tokens_seen": 36074440, "step": 62535 }, { "epoch": 9.314864462317546, "grad_norm": 0.3951241672039032, "learning_rate": 7.118600842451195e-07, "loss": 0.8323, "num_input_tokens_seen": 36077672, "step": 62540 }, { "epoch": 9.315609174858505, "grad_norm": 0.1663012057542801, "learning_rate": 7.103211040210778e-07, "loss": 0.7943, "num_input_tokens_seen": 36080360, "step": 62545 }, { "epoch": 9.316353887399464, "grad_norm": 0.18688218295574188, "learning_rate": 7.087837651947815e-07, "loss": 0.7843, "num_input_tokens_seen": 36083336, "step": 62550 }, { "epoch": 9.317098599940422, "grad_norm": 0.32195281982421875, "learning_rate": 7.072480678701198e-07, "loss": 0.7669, "num_input_tokens_seen": 36086312, "step": 62555 }, { "epoch": 9.317843312481383, "grad_norm": 0.31704744696617126, "learning_rate": 7.057140121508627e-07, "loss": 0.8155, "num_input_tokens_seen": 36089064, "step": 62560 }, { "epoch": 9.318588025022342, "grad_norm": 0.24725943803787231, "learning_rate": 7.041815981406852e-07, "loss": 0.799, "num_input_tokens_seen": 36091784, "step": 62565 }, { "epoch": 9.3193327375633, "grad_norm": 0.2713852822780609, "learning_rate": 7.026508259431297e-07, "loss": 0.8139, "num_input_tokens_seen": 36094536, "step": 62570 }, { "epoch": 9.32007745010426, "grad_norm": 0.19947747886180878, "learning_rate": 7.011216956616467e-07, "loss": 0.8293, "num_input_tokens_seen": 36097512, "step": 62575 }, { "epoch": 9.32082216264522, "grad_norm": 0.1911170929670334, "learning_rate": 6.995942073995676e-07, "loss": 0.783, "num_input_tokens_seen": 36100200, "step": 62580 }, { "epoch": 9.321566875186178, "grad_norm": 0.23541806638240814, "learning_rate": 6.980683612601152e-07, "loss": 0.7962, "num_input_tokens_seen": 36103016, "step": 62585 }, { "epoch": 9.322311587727137, "grad_norm": 0.2146725058555603, "learning_rate": 6.965441573463988e-07, "loss": 0.8109, "num_input_tokens_seen": 36105736, "step": 62590 }, { "epoch": 9.323056300268096, "grad_norm": 0.1577891707420349, "learning_rate": 6.950215957614164e-07, "loss": 0.8145, "num_input_tokens_seen": 36108552, "step": 62595 }, { "epoch": 9.323801012809056, "grad_norm": 0.2651512920856476, "learning_rate": 6.935006766080582e-07, "loss": 0.7985, "num_input_tokens_seen": 36111432, "step": 62600 }, { "epoch": 9.324545725350015, "grad_norm": 0.20941834151744843, "learning_rate": 6.919813999891028e-07, "loss": 0.81, "num_input_tokens_seen": 36114088, "step": 62605 }, { "epoch": 9.325290437890974, "grad_norm": 0.2026820331811905, "learning_rate": 6.904637660072128e-07, "loss": 0.7884, "num_input_tokens_seen": 36117064, "step": 62610 }, { "epoch": 9.326035150431933, "grad_norm": 0.18431203067302704, "learning_rate": 6.889477747649447e-07, "loss": 0.7729, "num_input_tokens_seen": 36120232, "step": 62615 }, { "epoch": 9.326779862972893, "grad_norm": 0.19052596390247345, "learning_rate": 6.874334263647503e-07, "loss": 0.7887, "num_input_tokens_seen": 36122888, "step": 62620 }, { "epoch": 9.327524575513852, "grad_norm": 0.27281779050827026, "learning_rate": 6.859207209089502e-07, "loss": 0.7882, "num_input_tokens_seen": 36125864, "step": 62625 }, { "epoch": 9.32826928805481, "grad_norm": 0.19339610636234283, "learning_rate": 6.844096584997767e-07, "loss": 0.8199, "num_input_tokens_seen": 36128584, "step": 62630 }, { "epoch": 9.32901400059577, "grad_norm": 0.19024094939231873, "learning_rate": 6.829002392393396e-07, "loss": 0.8028, "num_input_tokens_seen": 36131336, "step": 62635 }, { "epoch": 9.32975871313673, "grad_norm": 0.24752134084701538, "learning_rate": 6.813924632296353e-07, "loss": 0.7767, "num_input_tokens_seen": 36134216, "step": 62640 }, { "epoch": 9.330503425677689, "grad_norm": 0.20926080644130707, "learning_rate": 6.798863305725628e-07, "loss": 0.8152, "num_input_tokens_seen": 36136872, "step": 62645 }, { "epoch": 9.331248138218648, "grad_norm": 0.36791425943374634, "learning_rate": 6.783818413698878e-07, "loss": 0.8023, "num_input_tokens_seen": 36140264, "step": 62650 }, { "epoch": 9.331992850759606, "grad_norm": 0.2018653005361557, "learning_rate": 6.768789957232874e-07, "loss": 0.8071, "num_input_tokens_seen": 36143048, "step": 62655 }, { "epoch": 9.332737563300565, "grad_norm": 0.19803453981876373, "learning_rate": 6.753777937343109e-07, "loss": 0.7861, "num_input_tokens_seen": 36145768, "step": 62660 }, { "epoch": 9.333482275841526, "grad_norm": 0.20178896188735962, "learning_rate": 6.738782355044049e-07, "loss": 0.7761, "num_input_tokens_seen": 36148712, "step": 62665 }, { "epoch": 9.334226988382484, "grad_norm": 0.32784077525138855, "learning_rate": 6.72380321134905e-07, "loss": 0.8013, "num_input_tokens_seen": 36151496, "step": 62670 }, { "epoch": 9.334971700923443, "grad_norm": 0.17288951575756073, "learning_rate": 6.708840507270359e-07, "loss": 0.7864, "num_input_tokens_seen": 36154280, "step": 62675 }, { "epoch": 9.335716413464402, "grad_norm": 0.2277686893939972, "learning_rate": 6.693894243819082e-07, "loss": 0.7974, "num_input_tokens_seen": 36157256, "step": 62680 }, { "epoch": 9.336461126005362, "grad_norm": 0.19892294704914093, "learning_rate": 6.678964422005218e-07, "loss": 0.7734, "num_input_tokens_seen": 36160264, "step": 62685 }, { "epoch": 9.337205838546321, "grad_norm": 0.18326927721500397, "learning_rate": 6.66405104283771e-07, "loss": 0.8096, "num_input_tokens_seen": 36163016, "step": 62690 }, { "epoch": 9.33795055108728, "grad_norm": 0.21220073103904724, "learning_rate": 6.649154107324251e-07, "loss": 0.7645, "num_input_tokens_seen": 36165896, "step": 62695 }, { "epoch": 9.338695263628239, "grad_norm": 0.1882115602493286, "learning_rate": 6.634273616471565e-07, "loss": 0.8111, "num_input_tokens_seen": 36168904, "step": 62700 }, { "epoch": 9.3394399761692, "grad_norm": 0.19315290451049805, "learning_rate": 6.619409571285206e-07, "loss": 0.7593, "num_input_tokens_seen": 36171880, "step": 62705 }, { "epoch": 9.340184688710158, "grad_norm": 0.23359781503677368, "learning_rate": 6.604561972769652e-07, "loss": 0.7839, "num_input_tokens_seen": 36174632, "step": 62710 }, { "epoch": 9.340929401251117, "grad_norm": 0.23742075264453888, "learning_rate": 6.589730821928208e-07, "loss": 0.7861, "num_input_tokens_seen": 36177800, "step": 62715 }, { "epoch": 9.341674113792076, "grad_norm": 0.18195821344852448, "learning_rate": 6.574916119763158e-07, "loss": 0.7834, "num_input_tokens_seen": 36180936, "step": 62720 }, { "epoch": 9.342418826333036, "grad_norm": 0.19018743932247162, "learning_rate": 6.560117867275561e-07, "loss": 0.7756, "num_input_tokens_seen": 36183560, "step": 62725 }, { "epoch": 9.343163538873995, "grad_norm": 0.18992185592651367, "learning_rate": 6.545336065465451e-07, "loss": 0.8206, "num_input_tokens_seen": 36186440, "step": 62730 }, { "epoch": 9.343908251414954, "grad_norm": 0.17570888996124268, "learning_rate": 6.530570715331696e-07, "loss": 0.7808, "num_input_tokens_seen": 36189256, "step": 62735 }, { "epoch": 9.344652963955912, "grad_norm": 0.23548249900341034, "learning_rate": 6.515821817872109e-07, "loss": 0.8134, "num_input_tokens_seen": 36192264, "step": 62740 }, { "epoch": 9.345397676496873, "grad_norm": 0.29735633730888367, "learning_rate": 6.501089374083336e-07, "loss": 0.8022, "num_input_tokens_seen": 36195048, "step": 62745 }, { "epoch": 9.346142389037832, "grad_norm": 0.25660252571105957, "learning_rate": 6.48637338496097e-07, "loss": 0.8088, "num_input_tokens_seen": 36197896, "step": 62750 }, { "epoch": 9.34688710157879, "grad_norm": 0.22645103931427002, "learning_rate": 6.471673851499438e-07, "loss": 0.7565, "num_input_tokens_seen": 36200808, "step": 62755 }, { "epoch": 9.34763181411975, "grad_norm": 0.3095957338809967, "learning_rate": 6.456990774692057e-07, "loss": 0.7888, "num_input_tokens_seen": 36203880, "step": 62760 }, { "epoch": 9.34837652666071, "grad_norm": 0.22201304137706757, "learning_rate": 6.442324155531088e-07, "loss": 0.7889, "num_input_tokens_seen": 36206888, "step": 62765 }, { "epoch": 9.349121239201668, "grad_norm": 0.28099143505096436, "learning_rate": 6.4276739950076e-07, "loss": 0.7935, "num_input_tokens_seen": 36209800, "step": 62770 }, { "epoch": 9.349865951742627, "grad_norm": 0.18552255630493164, "learning_rate": 6.413040294111605e-07, "loss": 0.786, "num_input_tokens_seen": 36212648, "step": 62775 }, { "epoch": 9.350610664283586, "grad_norm": 0.19242334365844727, "learning_rate": 6.398423053832009e-07, "loss": 0.8215, "num_input_tokens_seen": 36215528, "step": 62780 }, { "epoch": 9.351355376824547, "grad_norm": 0.18687301874160767, "learning_rate": 6.383822275156576e-07, "loss": 0.756, "num_input_tokens_seen": 36218120, "step": 62785 }, { "epoch": 9.352100089365505, "grad_norm": 0.1416735202074051, "learning_rate": 6.369237959071933e-07, "loss": 0.7975, "num_input_tokens_seen": 36220808, "step": 62790 }, { "epoch": 9.352844801906464, "grad_norm": 0.22189724445343018, "learning_rate": 6.354670106563681e-07, "loss": 0.7752, "num_input_tokens_seen": 36223944, "step": 62795 }, { "epoch": 9.353589514447423, "grad_norm": 0.2500860095024109, "learning_rate": 6.340118718616228e-07, "loss": 0.791, "num_input_tokens_seen": 36226696, "step": 62800 }, { "epoch": 9.354334226988382, "grad_norm": 0.2417231947183609, "learning_rate": 6.325583796212925e-07, "loss": 0.7762, "num_input_tokens_seen": 36229832, "step": 62805 }, { "epoch": 9.355078939529342, "grad_norm": 0.21951185166835785, "learning_rate": 6.311065340335931e-07, "loss": 0.7802, "num_input_tokens_seen": 36232776, "step": 62810 }, { "epoch": 9.3558236520703, "grad_norm": 0.19500252604484558, "learning_rate": 6.296563351966378e-07, "loss": 0.7898, "num_input_tokens_seen": 36235592, "step": 62815 }, { "epoch": 9.35656836461126, "grad_norm": 0.2109449952840805, "learning_rate": 6.282077832084259e-07, "loss": 0.8085, "num_input_tokens_seen": 36238280, "step": 62820 }, { "epoch": 9.357313077152218, "grad_norm": 0.23507168889045715, "learning_rate": 6.267608781668433e-07, "loss": 0.8231, "num_input_tokens_seen": 36241064, "step": 62825 }, { "epoch": 9.358057789693179, "grad_norm": 0.2019716054201126, "learning_rate": 6.253156201696669e-07, "loss": 0.7841, "num_input_tokens_seen": 36243880, "step": 62830 }, { "epoch": 9.358802502234138, "grad_norm": 0.18134546279907227, "learning_rate": 6.238720093145578e-07, "loss": 0.82, "num_input_tokens_seen": 36246472, "step": 62835 }, { "epoch": 9.359547214775096, "grad_norm": 0.23205381631851196, "learning_rate": 6.22430045699074e-07, "loss": 0.8082, "num_input_tokens_seen": 36249224, "step": 62840 }, { "epoch": 9.360291927316055, "grad_norm": 0.21821735799312592, "learning_rate": 6.20989729420654e-07, "loss": 0.8047, "num_input_tokens_seen": 36251944, "step": 62845 }, { "epoch": 9.361036639857016, "grad_norm": 0.18682461977005005, "learning_rate": 6.195510605766342e-07, "loss": 0.7775, "num_input_tokens_seen": 36254824, "step": 62850 }, { "epoch": 9.361781352397974, "grad_norm": 0.34214603900909424, "learning_rate": 6.181140392642309e-07, "loss": 0.8249, "num_input_tokens_seen": 36257800, "step": 62855 }, { "epoch": 9.362526064938933, "grad_norm": 0.24030129611492157, "learning_rate": 6.166786655805473e-07, "loss": 0.7944, "num_input_tokens_seen": 36260840, "step": 62860 }, { "epoch": 9.363270777479892, "grad_norm": 0.1973806768655777, "learning_rate": 6.152449396225834e-07, "loss": 0.818, "num_input_tokens_seen": 36263752, "step": 62865 }, { "epoch": 9.364015490020853, "grad_norm": 0.2478438764810562, "learning_rate": 6.138128614872258e-07, "loss": 0.7867, "num_input_tokens_seen": 36266440, "step": 62870 }, { "epoch": 9.364760202561811, "grad_norm": 0.2246072143316269, "learning_rate": 6.123824312712494e-07, "loss": 0.8036, "num_input_tokens_seen": 36268968, "step": 62875 }, { "epoch": 9.36550491510277, "grad_norm": 0.20215573906898499, "learning_rate": 6.109536490713136e-07, "loss": 0.7996, "num_input_tokens_seen": 36271560, "step": 62880 }, { "epoch": 9.366249627643729, "grad_norm": 0.6737288236618042, "learning_rate": 6.095265149839769e-07, "loss": 0.8075, "num_input_tokens_seen": 36274696, "step": 62885 }, { "epoch": 9.36699434018469, "grad_norm": 0.2286539375782013, "learning_rate": 6.081010291056705e-07, "loss": 0.7925, "num_input_tokens_seen": 36277288, "step": 62890 }, { "epoch": 9.367739052725648, "grad_norm": 0.3312097489833832, "learning_rate": 6.066771915327257e-07, "loss": 0.7833, "num_input_tokens_seen": 36280328, "step": 62895 }, { "epoch": 9.368483765266607, "grad_norm": 0.2776886224746704, "learning_rate": 6.052550023613601e-07, "loss": 0.8265, "num_input_tokens_seen": 36283112, "step": 62900 }, { "epoch": 9.369228477807566, "grad_norm": 0.18777455389499664, "learning_rate": 6.038344616876801e-07, "loss": 0.7756, "num_input_tokens_seen": 36285928, "step": 62905 }, { "epoch": 9.369973190348526, "grad_norm": 0.23450414836406708, "learning_rate": 6.024155696076784e-07, "loss": 0.8199, "num_input_tokens_seen": 36288712, "step": 62910 }, { "epoch": 9.370717902889485, "grad_norm": 0.16487649083137512, "learning_rate": 6.009983262172392e-07, "loss": 0.7958, "num_input_tokens_seen": 36291368, "step": 62915 }, { "epoch": 9.371462615430444, "grad_norm": 0.2359071671962738, "learning_rate": 5.995827316121388e-07, "loss": 0.7914, "num_input_tokens_seen": 36294280, "step": 62920 }, { "epoch": 9.372207327971402, "grad_norm": 0.2148440182209015, "learning_rate": 5.981687858880258e-07, "loss": 0.7985, "num_input_tokens_seen": 36297000, "step": 62925 }, { "epoch": 9.372952040512363, "grad_norm": 0.19878432154655457, "learning_rate": 5.967564891404626e-07, "loss": 0.8235, "num_input_tokens_seen": 36299784, "step": 62930 }, { "epoch": 9.373696753053322, "grad_norm": 0.23000217974185944, "learning_rate": 5.953458414648755e-07, "loss": 0.8, "num_input_tokens_seen": 36302312, "step": 62935 }, { "epoch": 9.37444146559428, "grad_norm": 0.3388204872608185, "learning_rate": 5.939368429565911e-07, "loss": 0.8063, "num_input_tokens_seen": 36305448, "step": 62940 }, { "epoch": 9.37518617813524, "grad_norm": 0.27469390630722046, "learning_rate": 5.925294937108306e-07, "loss": 0.8239, "num_input_tokens_seen": 36308424, "step": 62945 }, { "epoch": 9.3759308906762, "grad_norm": 0.2720974385738373, "learning_rate": 5.911237938226954e-07, "loss": 0.8208, "num_input_tokens_seen": 36311336, "step": 62950 }, { "epoch": 9.376675603217159, "grad_norm": 0.15627239644527435, "learning_rate": 5.897197433871709e-07, "loss": 0.8065, "num_input_tokens_seen": 36313800, "step": 62955 }, { "epoch": 9.377420315758117, "grad_norm": 0.1506676971912384, "learning_rate": 5.883173424991423e-07, "loss": 0.7861, "num_input_tokens_seen": 36316712, "step": 62960 }, { "epoch": 9.378165028299076, "grad_norm": 0.20011653006076813, "learning_rate": 5.86916591253378e-07, "loss": 0.7812, "num_input_tokens_seen": 36319976, "step": 62965 }, { "epoch": 9.378909740840037, "grad_norm": 0.217153400182724, "learning_rate": 5.855174897445359e-07, "loss": 0.7691, "num_input_tokens_seen": 36323016, "step": 62970 }, { "epoch": 9.379654453380995, "grad_norm": 0.2222556322813034, "learning_rate": 5.841200380671569e-07, "loss": 0.796, "num_input_tokens_seen": 36326024, "step": 62975 }, { "epoch": 9.380399165921954, "grad_norm": 0.17041927576065063, "learning_rate": 5.827242363156793e-07, "loss": 0.7864, "num_input_tokens_seen": 36329224, "step": 62980 }, { "epoch": 9.381143878462913, "grad_norm": 0.20519500970840454, "learning_rate": 5.813300845844249e-07, "loss": 0.8153, "num_input_tokens_seen": 36332360, "step": 62985 }, { "epoch": 9.381888591003872, "grad_norm": 0.3906194865703583, "learning_rate": 5.799375829676018e-07, "loss": 0.8212, "num_input_tokens_seen": 36335016, "step": 62990 }, { "epoch": 9.382633303544832, "grad_norm": 0.17955516278743744, "learning_rate": 5.785467315593124e-07, "loss": 0.8142, "num_input_tokens_seen": 36337768, "step": 62995 }, { "epoch": 9.383378016085791, "grad_norm": 0.20358294248580933, "learning_rate": 5.771575304535453e-07, "loss": 0.8199, "num_input_tokens_seen": 36340680, "step": 63000 }, { "epoch": 9.38412272862675, "grad_norm": 0.21522867679595947, "learning_rate": 5.757699797441757e-07, "loss": 0.789, "num_input_tokens_seen": 36343400, "step": 63005 }, { "epoch": 9.384867441167708, "grad_norm": 0.28683945536613464, "learning_rate": 5.743840795249727e-07, "loss": 0.7817, "num_input_tokens_seen": 36346408, "step": 63010 }, { "epoch": 9.385612153708669, "grad_norm": 0.27093711495399475, "learning_rate": 5.729998298895839e-07, "loss": 0.82, "num_input_tokens_seen": 36349064, "step": 63015 }, { "epoch": 9.386356866249628, "grad_norm": 0.1876637190580368, "learning_rate": 5.716172309315537e-07, "loss": 0.7668, "num_input_tokens_seen": 36351784, "step": 63020 }, { "epoch": 9.387101578790586, "grad_norm": 0.29011067748069763, "learning_rate": 5.702362827443131e-07, "loss": 0.7985, "num_input_tokens_seen": 36354760, "step": 63025 }, { "epoch": 9.387846291331545, "grad_norm": 0.23265206813812256, "learning_rate": 5.68856985421179e-07, "loss": 0.7982, "num_input_tokens_seen": 36357832, "step": 63030 }, { "epoch": 9.388591003872506, "grad_norm": 0.20185035467147827, "learning_rate": 5.674793390553601e-07, "loss": 0.7647, "num_input_tokens_seen": 36360712, "step": 63035 }, { "epoch": 9.389335716413465, "grad_norm": 0.23037640750408173, "learning_rate": 5.661033437399516e-07, "loss": 0.7886, "num_input_tokens_seen": 36363432, "step": 63040 }, { "epoch": 9.390080428954423, "grad_norm": 0.23299440741539001, "learning_rate": 5.647289995679372e-07, "loss": 0.8417, "num_input_tokens_seen": 36366280, "step": 63045 }, { "epoch": 9.390825141495382, "grad_norm": 0.23737765848636627, "learning_rate": 5.633563066321956e-07, "loss": 0.7895, "num_input_tokens_seen": 36368968, "step": 63050 }, { "epoch": 9.391569854036343, "grad_norm": 0.2618856430053711, "learning_rate": 5.619852650254803e-07, "loss": 0.782, "num_input_tokens_seen": 36371912, "step": 63055 }, { "epoch": 9.392314566577301, "grad_norm": 0.17454788088798523, "learning_rate": 5.606158748404423e-07, "loss": 0.806, "num_input_tokens_seen": 36374696, "step": 63060 }, { "epoch": 9.39305927911826, "grad_norm": 0.19167198240756989, "learning_rate": 5.592481361696183e-07, "loss": 0.7862, "num_input_tokens_seen": 36377480, "step": 63065 }, { "epoch": 9.393803991659219, "grad_norm": 0.23464153707027435, "learning_rate": 5.578820491054376e-07, "loss": 0.7931, "num_input_tokens_seen": 36380296, "step": 63070 }, { "epoch": 9.39454870420018, "grad_norm": 0.17578457295894623, "learning_rate": 5.565176137402123e-07, "loss": 0.7922, "num_input_tokens_seen": 36383048, "step": 63075 }, { "epoch": 9.395293416741138, "grad_norm": 0.20156212151050568, "learning_rate": 5.551548301661492e-07, "loss": 0.7963, "num_input_tokens_seen": 36385896, "step": 63080 }, { "epoch": 9.396038129282097, "grad_norm": 0.5116623640060425, "learning_rate": 5.537936984753384e-07, "loss": 0.811, "num_input_tokens_seen": 36389032, "step": 63085 }, { "epoch": 9.396782841823056, "grad_norm": 0.17845956981182098, "learning_rate": 5.524342187597564e-07, "loss": 0.7938, "num_input_tokens_seen": 36392008, "step": 63090 }, { "epoch": 9.397527554364016, "grad_norm": 0.20238935947418213, "learning_rate": 5.510763911112743e-07, "loss": 0.8388, "num_input_tokens_seen": 36394824, "step": 63095 }, { "epoch": 9.398272266904975, "grad_norm": 0.1773703694343567, "learning_rate": 5.497202156216463e-07, "loss": 0.7872, "num_input_tokens_seen": 36397512, "step": 63100 }, { "epoch": 9.399016979445934, "grad_norm": 0.21955806016921997, "learning_rate": 5.483656923825159e-07, "loss": 0.8146, "num_input_tokens_seen": 36400424, "step": 63105 }, { "epoch": 9.399761691986892, "grad_norm": 0.2645917534828186, "learning_rate": 5.470128214854236e-07, "loss": 0.7877, "num_input_tokens_seen": 36403240, "step": 63110 }, { "epoch": 9.400506404527853, "grad_norm": 0.2740606367588043, "learning_rate": 5.456616030217853e-07, "loss": 0.8143, "num_input_tokens_seen": 36406024, "step": 63115 }, { "epoch": 9.401251117068812, "grad_norm": 0.2770591974258423, "learning_rate": 5.443120370829114e-07, "loss": 0.7936, "num_input_tokens_seen": 36409000, "step": 63120 }, { "epoch": 9.40199582960977, "grad_norm": 0.1921168714761734, "learning_rate": 5.429641237599981e-07, "loss": 0.7863, "num_input_tokens_seen": 36411976, "step": 63125 }, { "epoch": 9.40274054215073, "grad_norm": 0.2138839215040207, "learning_rate": 5.416178631441393e-07, "loss": 0.8025, "num_input_tokens_seen": 36414760, "step": 63130 }, { "epoch": 9.40348525469169, "grad_norm": 0.17622163891792297, "learning_rate": 5.402732553263012e-07, "loss": 0.8118, "num_input_tokens_seen": 36417544, "step": 63135 }, { "epoch": 9.404229967232649, "grad_norm": 0.1939355880022049, "learning_rate": 5.389303003973501e-07, "loss": 0.7831, "num_input_tokens_seen": 36420520, "step": 63140 }, { "epoch": 9.404974679773607, "grad_norm": 0.21010960638523102, "learning_rate": 5.375889984480381e-07, "loss": 0.8333, "num_input_tokens_seen": 36423240, "step": 63145 }, { "epoch": 9.405719392314566, "grad_norm": 0.2415589541196823, "learning_rate": 5.362493495690069e-07, "loss": 0.7666, "num_input_tokens_seen": 36426376, "step": 63150 }, { "epoch": 9.406464104855527, "grad_norm": 0.27791884541511536, "learning_rate": 5.349113538507783e-07, "loss": 0.7847, "num_input_tokens_seen": 36429320, "step": 63155 }, { "epoch": 9.407208817396485, "grad_norm": 0.1942000538110733, "learning_rate": 5.335750113837745e-07, "loss": 0.768, "num_input_tokens_seen": 36432360, "step": 63160 }, { "epoch": 9.407953529937444, "grad_norm": 0.17260955274105072, "learning_rate": 5.322403222582984e-07, "loss": 0.7982, "num_input_tokens_seen": 36435496, "step": 63165 }, { "epoch": 9.408698242478403, "grad_norm": 0.20268584787845612, "learning_rate": 5.309072865645442e-07, "loss": 0.7834, "num_input_tokens_seen": 36438248, "step": 63170 }, { "epoch": 9.409442955019362, "grad_norm": 0.21301202476024628, "learning_rate": 5.295759043925902e-07, "loss": 0.8025, "num_input_tokens_seen": 36441288, "step": 63175 }, { "epoch": 9.410187667560322, "grad_norm": 0.1956786960363388, "learning_rate": 5.282461758324058e-07, "loss": 0.7966, "num_input_tokens_seen": 36444200, "step": 63180 }, { "epoch": 9.410932380101281, "grad_norm": 0.6131284832954407, "learning_rate": 5.269181009738527e-07, "loss": 0.7884, "num_input_tokens_seen": 36447400, "step": 63185 }, { "epoch": 9.41167709264224, "grad_norm": 0.26624447107315063, "learning_rate": 5.255916799066729e-07, "loss": 0.8166, "num_input_tokens_seen": 36450600, "step": 63190 }, { "epoch": 9.412421805183198, "grad_norm": 0.18164364993572235, "learning_rate": 5.242669127205002e-07, "loss": 0.7873, "num_input_tokens_seen": 36453448, "step": 63195 }, { "epoch": 9.413166517724159, "grad_norm": 0.33736419677734375, "learning_rate": 5.229437995048603e-07, "loss": 0.8274, "num_input_tokens_seen": 36456168, "step": 63200 }, { "epoch": 9.413911230265118, "grad_norm": 0.2289399951696396, "learning_rate": 5.216223403491593e-07, "loss": 0.8126, "num_input_tokens_seen": 36459336, "step": 63205 }, { "epoch": 9.414655942806077, "grad_norm": 0.23519805073738098, "learning_rate": 5.203025353427038e-07, "loss": 0.8172, "num_input_tokens_seen": 36461992, "step": 63210 }, { "epoch": 9.415400655347035, "grad_norm": 0.20742720365524292, "learning_rate": 5.189843845746723e-07, "loss": 0.8118, "num_input_tokens_seen": 36465192, "step": 63215 }, { "epoch": 9.416145367887996, "grad_norm": 0.2892329692840576, "learning_rate": 5.176678881341435e-07, "loss": 0.801, "num_input_tokens_seen": 36468008, "step": 63220 }, { "epoch": 9.416890080428955, "grad_norm": 0.2584228813648224, "learning_rate": 5.163530461100824e-07, "loss": 0.798, "num_input_tokens_seen": 36471080, "step": 63225 }, { "epoch": 9.417634792969913, "grad_norm": 0.3252069652080536, "learning_rate": 5.150398585913374e-07, "loss": 0.8003, "num_input_tokens_seen": 36473864, "step": 63230 }, { "epoch": 9.418379505510872, "grad_norm": 0.19443677365779877, "learning_rate": 5.137283256666486e-07, "loss": 0.8022, "num_input_tokens_seen": 36476840, "step": 63235 }, { "epoch": 9.419124218051833, "grad_norm": 0.18500734865665436, "learning_rate": 5.12418447424648e-07, "loss": 0.799, "num_input_tokens_seen": 36479560, "step": 63240 }, { "epoch": 9.419868930592791, "grad_norm": 0.1855078637599945, "learning_rate": 5.111102239538479e-07, "loss": 0.8002, "num_input_tokens_seen": 36482024, "step": 63245 }, { "epoch": 9.42061364313375, "grad_norm": 0.2623692750930786, "learning_rate": 5.098036553426583e-07, "loss": 0.8134, "num_input_tokens_seen": 36485256, "step": 63250 }, { "epoch": 9.421358355674709, "grad_norm": 0.18628421425819397, "learning_rate": 5.084987416793669e-07, "loss": 0.8109, "num_input_tokens_seen": 36487912, "step": 63255 }, { "epoch": 9.42210306821567, "grad_norm": 0.17200742661952972, "learning_rate": 5.07195483052153e-07, "loss": 0.8104, "num_input_tokens_seen": 36490600, "step": 63260 }, { "epoch": 9.422847780756628, "grad_norm": 0.2822885811328888, "learning_rate": 5.058938795490881e-07, "loss": 0.7954, "num_input_tokens_seen": 36493352, "step": 63265 }, { "epoch": 9.423592493297587, "grad_norm": 0.22714613378047943, "learning_rate": 5.045939312581294e-07, "loss": 0.7831, "num_input_tokens_seen": 36496296, "step": 63270 }, { "epoch": 9.424337205838546, "grad_norm": 0.25736331939697266, "learning_rate": 5.032956382671206e-07, "loss": 0.8011, "num_input_tokens_seen": 36499080, "step": 63275 }, { "epoch": 9.425081918379506, "grad_norm": 0.14716611802577972, "learning_rate": 5.019990006637998e-07, "loss": 0.8023, "num_input_tokens_seen": 36501736, "step": 63280 }, { "epoch": 9.425826630920465, "grad_norm": 0.2232145071029663, "learning_rate": 5.007040185357803e-07, "loss": 0.802, "num_input_tokens_seen": 36504680, "step": 63285 }, { "epoch": 9.426571343461424, "grad_norm": 0.2501632869243622, "learning_rate": 4.99410691970581e-07, "loss": 0.7974, "num_input_tokens_seen": 36507560, "step": 63290 }, { "epoch": 9.427316056002383, "grad_norm": 0.2707456052303314, "learning_rate": 4.98119021055593e-07, "loss": 0.7919, "num_input_tokens_seen": 36510440, "step": 63295 }, { "epoch": 9.428060768543343, "grad_norm": 0.18091025948524475, "learning_rate": 4.968290058781022e-07, "loss": 0.8166, "num_input_tokens_seen": 36513512, "step": 63300 }, { "epoch": 9.428805481084302, "grad_norm": 0.16703489422798157, "learning_rate": 4.95540646525286e-07, "loss": 0.796, "num_input_tokens_seen": 36516328, "step": 63305 }, { "epoch": 9.42955019362526, "grad_norm": 0.1934719979763031, "learning_rate": 4.942539430842052e-07, "loss": 0.8139, "num_input_tokens_seen": 36519144, "step": 63310 }, { "epoch": 9.43029490616622, "grad_norm": 0.21159517765045166, "learning_rate": 4.929688956418099e-07, "loss": 0.7871, "num_input_tokens_seen": 36521992, "step": 63315 }, { "epoch": 9.43103961870718, "grad_norm": 0.2946634888648987, "learning_rate": 4.916855042849388e-07, "loss": 0.7924, "num_input_tokens_seen": 36525160, "step": 63320 }, { "epoch": 9.431784331248139, "grad_norm": 0.1580602079629898, "learning_rate": 4.904037691003172e-07, "loss": 0.8301, "num_input_tokens_seen": 36527944, "step": 63325 }, { "epoch": 9.432529043789097, "grad_norm": 0.22333642840385437, "learning_rate": 4.891236901745616e-07, "loss": 0.8169, "num_input_tokens_seen": 36531016, "step": 63330 }, { "epoch": 9.433273756330056, "grad_norm": 0.26584237813949585, "learning_rate": 4.878452675941697e-07, "loss": 0.7952, "num_input_tokens_seen": 36533992, "step": 63335 }, { "epoch": 9.434018468871017, "grad_norm": 0.2123432755470276, "learning_rate": 4.865685014455363e-07, "loss": 0.7751, "num_input_tokens_seen": 36536616, "step": 63340 }, { "epoch": 9.434763181411975, "grad_norm": 0.30307677388191223, "learning_rate": 4.852933918149394e-07, "loss": 0.7867, "num_input_tokens_seen": 36539528, "step": 63345 }, { "epoch": 9.435507893952934, "grad_norm": 0.15702952444553375, "learning_rate": 4.840199387885491e-07, "loss": 0.81, "num_input_tokens_seen": 36542344, "step": 63350 }, { "epoch": 9.436252606493893, "grad_norm": 0.2395259290933609, "learning_rate": 4.827481424524133e-07, "loss": 0.8011, "num_input_tokens_seen": 36545224, "step": 63355 }, { "epoch": 9.436997319034852, "grad_norm": 0.31041809916496277, "learning_rate": 4.81478002892477e-07, "loss": 0.8146, "num_input_tokens_seen": 36548264, "step": 63360 }, { "epoch": 9.437742031575812, "grad_norm": 0.2037489116191864, "learning_rate": 4.802095201945745e-07, "loss": 0.8043, "num_input_tokens_seen": 36550984, "step": 63365 }, { "epoch": 9.438486744116771, "grad_norm": 0.2614342272281647, "learning_rate": 4.789426944444231e-07, "loss": 0.8179, "num_input_tokens_seen": 36554056, "step": 63370 }, { "epoch": 9.43923145665773, "grad_norm": 0.281101256608963, "learning_rate": 4.776775257276267e-07, "loss": 0.7937, "num_input_tokens_seen": 36557032, "step": 63375 }, { "epoch": 9.439976169198689, "grad_norm": 0.1757938116788864, "learning_rate": 4.7641401412968357e-07, "loss": 0.8032, "num_input_tokens_seen": 36559912, "step": 63380 }, { "epoch": 9.440720881739649, "grad_norm": 0.3100924491882324, "learning_rate": 4.7515215973597815e-07, "loss": 0.7851, "num_input_tokens_seen": 36562952, "step": 63385 }, { "epoch": 9.441465594280608, "grad_norm": 0.19605916738510132, "learning_rate": 4.738919626317756e-07, "loss": 0.7987, "num_input_tokens_seen": 36565864, "step": 63390 }, { "epoch": 9.442210306821567, "grad_norm": 0.2101118266582489, "learning_rate": 4.726334229022383e-07, "loss": 0.7705, "num_input_tokens_seen": 36568840, "step": 63395 }, { "epoch": 9.442955019362525, "grad_norm": 0.20283164083957672, "learning_rate": 4.71376540632415e-07, "loss": 0.8168, "num_input_tokens_seen": 36571528, "step": 63400 }, { "epoch": 9.443699731903486, "grad_norm": 0.21676898002624512, "learning_rate": 4.7012131590723765e-07, "loss": 0.8341, "num_input_tokens_seen": 36574120, "step": 63405 }, { "epoch": 9.444444444444445, "grad_norm": 0.1990332305431366, "learning_rate": 4.688677488115328e-07, "loss": 0.7908, "num_input_tokens_seen": 36577288, "step": 63410 }, { "epoch": 9.445189156985403, "grad_norm": 0.235639289021492, "learning_rate": 4.676158394300051e-07, "loss": 0.8173, "num_input_tokens_seen": 36580072, "step": 63415 }, { "epoch": 9.445933869526362, "grad_norm": 0.262735515832901, "learning_rate": 4.663655878472617e-07, "loss": 0.8041, "num_input_tokens_seen": 36583144, "step": 63420 }, { "epoch": 9.446678582067323, "grad_norm": 0.2433289736509323, "learning_rate": 4.651169941477851e-07, "loss": 0.8, "num_input_tokens_seen": 36585896, "step": 63425 }, { "epoch": 9.447423294608281, "grad_norm": 0.2353251576423645, "learning_rate": 4.6387005841594943e-07, "loss": 0.8195, "num_input_tokens_seen": 36589032, "step": 63430 }, { "epoch": 9.44816800714924, "grad_norm": 0.2891150414943695, "learning_rate": 4.626247807360151e-07, "loss": 0.8193, "num_input_tokens_seen": 36592072, "step": 63435 }, { "epoch": 9.448912719690199, "grad_norm": 0.29395467042922974, "learning_rate": 4.613811611921398e-07, "loss": 0.7973, "num_input_tokens_seen": 36595080, "step": 63440 }, { "epoch": 9.44965743223116, "grad_norm": 0.30239227414131165, "learning_rate": 4.6013919986836187e-07, "loss": 0.7684, "num_input_tokens_seen": 36598152, "step": 63445 }, { "epoch": 9.450402144772118, "grad_norm": 0.2607392370700836, "learning_rate": 4.588988968486002e-07, "loss": 0.8167, "num_input_tokens_seen": 36601096, "step": 63450 }, { "epoch": 9.451146857313077, "grad_norm": 0.26463836431503296, "learning_rate": 4.5766025221667674e-07, "loss": 0.823, "num_input_tokens_seen": 36604008, "step": 63455 }, { "epoch": 9.451891569854036, "grad_norm": 0.240153506398201, "learning_rate": 4.5642326605629116e-07, "loss": 0.8082, "num_input_tokens_seen": 36606888, "step": 63460 }, { "epoch": 9.452636282394996, "grad_norm": 0.3121110200881958, "learning_rate": 4.5518793845103215e-07, "loss": 0.7884, "num_input_tokens_seen": 36609928, "step": 63465 }, { "epoch": 9.453380994935955, "grad_norm": 0.2285519391298294, "learning_rate": 4.539542694843829e-07, "loss": 0.7828, "num_input_tokens_seen": 36612712, "step": 63470 }, { "epoch": 9.454125707476914, "grad_norm": 0.23327793180942535, "learning_rate": 4.527222592397046e-07, "loss": 0.8025, "num_input_tokens_seen": 36615400, "step": 63475 }, { "epoch": 9.454870420017873, "grad_norm": 0.19832713901996613, "learning_rate": 4.514919078002583e-07, "loss": 0.8364, "num_input_tokens_seen": 36618344, "step": 63480 }, { "epoch": 9.455615132558833, "grad_norm": 0.20809878408908844, "learning_rate": 4.502632152491776e-07, "loss": 0.8126, "num_input_tokens_seen": 36621352, "step": 63485 }, { "epoch": 9.456359845099792, "grad_norm": 0.17471939325332642, "learning_rate": 4.490361816694988e-07, "loss": 0.8118, "num_input_tokens_seen": 36624264, "step": 63490 }, { "epoch": 9.45710455764075, "grad_norm": 0.24643371999263763, "learning_rate": 4.478108071441389e-07, "loss": 0.7934, "num_input_tokens_seen": 36626952, "step": 63495 }, { "epoch": 9.45784927018171, "grad_norm": 0.1929282397031784, "learning_rate": 4.4658709175590116e-07, "loss": 0.7773, "num_input_tokens_seen": 36629672, "step": 63500 }, { "epoch": 9.458593982722668, "grad_norm": 0.22012577950954437, "learning_rate": 4.4536503558748057e-07, "loss": 0.7762, "num_input_tokens_seen": 36632488, "step": 63505 }, { "epoch": 9.459338695263629, "grad_norm": 0.28392407298088074, "learning_rate": 4.441446387214582e-07, "loss": 0.7996, "num_input_tokens_seen": 36635112, "step": 63510 }, { "epoch": 9.460083407804587, "grad_norm": 0.19215482473373413, "learning_rate": 4.4292590124030697e-07, "loss": 0.7979, "num_input_tokens_seen": 36637800, "step": 63515 }, { "epoch": 9.460828120345546, "grad_norm": 0.23637133836746216, "learning_rate": 4.4170882322638053e-07, "loss": 0.8105, "num_input_tokens_seen": 36640616, "step": 63520 }, { "epoch": 9.461572832886505, "grad_norm": 0.3914792835712433, "learning_rate": 4.4049340476192414e-07, "loss": 0.8377, "num_input_tokens_seen": 36643816, "step": 63525 }, { "epoch": 9.462317545427466, "grad_norm": 0.21009282767772675, "learning_rate": 4.392796459290721e-07, "loss": 0.7821, "num_input_tokens_seen": 36646536, "step": 63530 }, { "epoch": 9.463062257968424, "grad_norm": 0.28394967317581177, "learning_rate": 4.380675468098477e-07, "loss": 0.811, "num_input_tokens_seen": 36649128, "step": 63535 }, { "epoch": 9.463806970509383, "grad_norm": 0.24034129083156586, "learning_rate": 4.3685710748615493e-07, "loss": 0.8076, "num_input_tokens_seen": 36651976, "step": 63540 }, { "epoch": 9.464551683050342, "grad_norm": 0.33275753259658813, "learning_rate": 4.356483280397894e-07, "loss": 0.7817, "num_input_tokens_seen": 36654920, "step": 63545 }, { "epoch": 9.465296395591302, "grad_norm": 0.25161370635032654, "learning_rate": 4.344412085524441e-07, "loss": 0.8123, "num_input_tokens_seen": 36657832, "step": 63550 }, { "epoch": 9.466041108132261, "grad_norm": 0.29335132241249084, "learning_rate": 4.3323574910568157e-07, "loss": 0.7912, "num_input_tokens_seen": 36660712, "step": 63555 }, { "epoch": 9.46678582067322, "grad_norm": 0.1703321784734726, "learning_rate": 4.320319497809672e-07, "loss": 0.8005, "num_input_tokens_seen": 36663432, "step": 63560 }, { "epoch": 9.467530533214179, "grad_norm": 0.15589769184589386, "learning_rate": 4.30829810659647e-07, "loss": 0.807, "num_input_tokens_seen": 36666344, "step": 63565 }, { "epoch": 9.46827524575514, "grad_norm": 0.2431599646806717, "learning_rate": 4.2962933182295606e-07, "loss": 0.802, "num_input_tokens_seen": 36669128, "step": 63570 }, { "epoch": 9.469019958296098, "grad_norm": 0.38148900866508484, "learning_rate": 4.2843051335202386e-07, "loss": 0.7995, "num_input_tokens_seen": 36672104, "step": 63575 }, { "epoch": 9.469764670837057, "grad_norm": 0.2948184311389923, "learning_rate": 4.2723335532785235e-07, "loss": 0.825, "num_input_tokens_seen": 36674984, "step": 63580 }, { "epoch": 9.470509383378015, "grad_norm": 0.24463459849357605, "learning_rate": 4.26037857831349e-07, "loss": 0.7852, "num_input_tokens_seen": 36678408, "step": 63585 }, { "epoch": 9.471254095918976, "grad_norm": 0.2250058650970459, "learning_rate": 4.2484402094329354e-07, "loss": 0.8115, "num_input_tokens_seen": 36681352, "step": 63590 }, { "epoch": 9.471998808459935, "grad_norm": 0.27574780583381653, "learning_rate": 4.2365184474436327e-07, "loss": 0.8072, "num_input_tokens_seen": 36684552, "step": 63595 }, { "epoch": 9.472743521000893, "grad_norm": 0.1868390440940857, "learning_rate": 4.224613293151214e-07, "loss": 0.8247, "num_input_tokens_seen": 36687304, "step": 63600 }, { "epoch": 9.473488233541852, "grad_norm": 0.28559431433677673, "learning_rate": 4.212724747360175e-07, "loss": 0.7952, "num_input_tokens_seen": 36690344, "step": 63605 }, { "epoch": 9.474232946082813, "grad_norm": 0.2596268653869629, "learning_rate": 4.2008528108739287e-07, "loss": 0.7774, "num_input_tokens_seen": 36692936, "step": 63610 }, { "epoch": 9.474977658623772, "grad_norm": 0.17538131773471832, "learning_rate": 4.1889974844946947e-07, "loss": 0.7763, "num_input_tokens_seen": 36695816, "step": 63615 }, { "epoch": 9.47572237116473, "grad_norm": 0.2441609650850296, "learning_rate": 4.177158769023609e-07, "loss": 0.7885, "num_input_tokens_seen": 36699208, "step": 63620 }, { "epoch": 9.476467083705689, "grad_norm": 0.18177226185798645, "learning_rate": 4.1653366652607e-07, "loss": 0.7992, "num_input_tokens_seen": 36701896, "step": 63625 }, { "epoch": 9.47721179624665, "grad_norm": 0.20824328064918518, "learning_rate": 4.153531174004827e-07, "loss": 0.7918, "num_input_tokens_seen": 36704680, "step": 63630 }, { "epoch": 9.477956508787608, "grad_norm": 0.23638078570365906, "learning_rate": 4.141742296053769e-07, "loss": 0.8009, "num_input_tokens_seen": 36707528, "step": 63635 }, { "epoch": 9.478701221328567, "grad_norm": 0.3900686800479889, "learning_rate": 4.1299700322041945e-07, "loss": 0.795, "num_input_tokens_seen": 36710600, "step": 63640 }, { "epoch": 9.479445933869526, "grad_norm": 0.18637871742248535, "learning_rate": 4.118214383251634e-07, "loss": 0.8074, "num_input_tokens_seen": 36713544, "step": 63645 }, { "epoch": 9.480190646410486, "grad_norm": 0.3446192741394043, "learning_rate": 4.106475349990452e-07, "loss": 0.7981, "num_input_tokens_seen": 36716616, "step": 63650 }, { "epoch": 9.480935358951445, "grad_norm": 0.2242102324962616, "learning_rate": 4.09475293321393e-07, "loss": 0.819, "num_input_tokens_seen": 36719528, "step": 63655 }, { "epoch": 9.481680071492404, "grad_norm": 0.2508523762226105, "learning_rate": 4.0830471337142407e-07, "loss": 0.799, "num_input_tokens_seen": 36722440, "step": 63660 }, { "epoch": 9.482424784033363, "grad_norm": 0.29290464520454407, "learning_rate": 4.071357952282362e-07, "loss": 0.7908, "num_input_tokens_seen": 36725352, "step": 63665 }, { "epoch": 9.483169496574323, "grad_norm": 0.2955452501773834, "learning_rate": 4.059685389708273e-07, "loss": 0.8004, "num_input_tokens_seen": 36728168, "step": 63670 }, { "epoch": 9.483914209115282, "grad_norm": 0.34136298298835754, "learning_rate": 4.048029446780704e-07, "loss": 0.8501, "num_input_tokens_seen": 36730856, "step": 63675 }, { "epoch": 9.48465892165624, "grad_norm": 0.16333384811878204, "learning_rate": 4.0363901242873594e-07, "loss": 0.8103, "num_input_tokens_seen": 36733608, "step": 63680 }, { "epoch": 9.4854036341972, "grad_norm": 0.22116880118846893, "learning_rate": 4.0247674230147467e-07, "loss": 0.7657, "num_input_tokens_seen": 36736360, "step": 63685 }, { "epoch": 9.486148346738158, "grad_norm": 0.19343838095664978, "learning_rate": 4.0131613437482674e-07, "loss": 0.8132, "num_input_tokens_seen": 36739048, "step": 63690 }, { "epoch": 9.486893059279119, "grad_norm": 0.2046148180961609, "learning_rate": 4.001571887272293e-07, "loss": 0.7908, "num_input_tokens_seen": 36741864, "step": 63695 }, { "epoch": 9.487637771820078, "grad_norm": 0.15473252534866333, "learning_rate": 3.989999054369864e-07, "loss": 0.7769, "num_input_tokens_seen": 36744776, "step": 63700 }, { "epoch": 9.488382484361036, "grad_norm": 0.2289755791425705, "learning_rate": 3.978442845823133e-07, "loss": 0.8201, "num_input_tokens_seen": 36747624, "step": 63705 }, { "epoch": 9.489127196901995, "grad_norm": 0.21163460612297058, "learning_rate": 3.966903262412974e-07, "loss": 0.7948, "num_input_tokens_seen": 36750216, "step": 63710 }, { "epoch": 9.489871909442956, "grad_norm": 0.22364819049835205, "learning_rate": 3.9553803049192096e-07, "loss": 0.7987, "num_input_tokens_seen": 36752872, "step": 63715 }, { "epoch": 9.490616621983914, "grad_norm": 0.3595868945121765, "learning_rate": 3.9438739741204935e-07, "loss": 0.8048, "num_input_tokens_seen": 36755912, "step": 63720 }, { "epoch": 9.491361334524873, "grad_norm": 0.29685696959495544, "learning_rate": 3.9323842707943703e-07, "loss": 0.7773, "num_input_tokens_seen": 36758696, "step": 63725 }, { "epoch": 9.492106047065832, "grad_norm": 0.29428067803382874, "learning_rate": 3.920911195717275e-07, "loss": 0.8002, "num_input_tokens_seen": 36761416, "step": 63730 }, { "epoch": 9.492850759606792, "grad_norm": 0.25103092193603516, "learning_rate": 3.909454749664532e-07, "loss": 0.8107, "num_input_tokens_seen": 36764040, "step": 63735 }, { "epoch": 9.493595472147751, "grad_norm": 0.22492846846580505, "learning_rate": 3.898014933410299e-07, "loss": 0.8136, "num_input_tokens_seen": 36766760, "step": 63740 }, { "epoch": 9.49434018468871, "grad_norm": 0.2707516551017761, "learning_rate": 3.8865917477276527e-07, "loss": 0.8042, "num_input_tokens_seen": 36769352, "step": 63745 }, { "epoch": 9.495084897229669, "grad_norm": 0.2102973312139511, "learning_rate": 3.875185193388503e-07, "loss": 0.812, "num_input_tokens_seen": 36772200, "step": 63750 }, { "epoch": 9.49582960977063, "grad_norm": 0.2034752070903778, "learning_rate": 3.8637952711636504e-07, "loss": 0.8146, "num_input_tokens_seen": 36774760, "step": 63755 }, { "epoch": 9.496574322311588, "grad_norm": 0.27902889251708984, "learning_rate": 3.8524219818228123e-07, "loss": 0.7725, "num_input_tokens_seen": 36777672, "step": 63760 }, { "epoch": 9.497319034852547, "grad_norm": 0.24692624807357788, "learning_rate": 3.8410653261345407e-07, "loss": 0.8193, "num_input_tokens_seen": 36780712, "step": 63765 }, { "epoch": 9.498063747393505, "grad_norm": 0.18800337612628937, "learning_rate": 3.8297253048662494e-07, "loss": 0.7792, "num_input_tokens_seen": 36783464, "step": 63770 }, { "epoch": 9.498808459934466, "grad_norm": 0.22385060787200928, "learning_rate": 3.818401918784298e-07, "loss": 0.834, "num_input_tokens_seen": 36785992, "step": 63775 }, { "epoch": 9.499553172475425, "grad_norm": 0.16463184356689453, "learning_rate": 3.807095168653796e-07, "loss": 0.7526, "num_input_tokens_seen": 36789032, "step": 63780 }, { "epoch": 9.5, "eval_loss": 0.8028320670127869, "eval_runtime": 45.2152, "eval_samples_per_second": 65.995, "eval_steps_per_second": 16.499, "num_input_tokens_seen": 36790952, "step": 63783 }, { "epoch": 9.500297885016384, "grad_norm": 0.2510003447532654, "learning_rate": 3.7958050552389104e-07, "loss": 0.7989, "num_input_tokens_seen": 36792136, "step": 63785 }, { "epoch": 9.501042597557342, "grad_norm": 0.20244263112545013, "learning_rate": 3.784531579302475e-07, "loss": 0.7931, "num_input_tokens_seen": 36795016, "step": 63790 }, { "epoch": 9.501787310098303, "grad_norm": 0.2723853588104248, "learning_rate": 3.7732747416063805e-07, "loss": 0.7758, "num_input_tokens_seen": 36797832, "step": 63795 }, { "epoch": 9.502532022639262, "grad_norm": 0.2086350917816162, "learning_rate": 3.762034542911269e-07, "loss": 0.7878, "num_input_tokens_seen": 36800616, "step": 63800 }, { "epoch": 9.50327673518022, "grad_norm": 0.20852695405483246, "learning_rate": 3.7508109839767546e-07, "loss": 0.802, "num_input_tokens_seen": 36803464, "step": 63805 }, { "epoch": 9.504021447721179, "grad_norm": 0.15622209012508392, "learning_rate": 3.7396040655612587e-07, "loss": 0.7807, "num_input_tokens_seen": 36806376, "step": 63810 }, { "epoch": 9.50476616026214, "grad_norm": 0.28639477491378784, "learning_rate": 3.728413788422119e-07, "loss": 0.7829, "num_input_tokens_seen": 36809352, "step": 63815 }, { "epoch": 9.505510872803098, "grad_norm": 0.2585262954235077, "learning_rate": 3.7172401533154823e-07, "loss": 0.7864, "num_input_tokens_seen": 36812072, "step": 63820 }, { "epoch": 9.506255585344057, "grad_norm": 0.34992459416389465, "learning_rate": 3.706083160996437e-07, "loss": 0.8168, "num_input_tokens_seen": 36815176, "step": 63825 }, { "epoch": 9.507000297885016, "grad_norm": 0.22517329454421997, "learning_rate": 3.6949428122189375e-07, "loss": 0.7907, "num_input_tokens_seen": 36818056, "step": 63830 }, { "epoch": 9.507745010425975, "grad_norm": 0.1914953738451004, "learning_rate": 3.6838191077357975e-07, "loss": 0.7949, "num_input_tokens_seen": 36821032, "step": 63835 }, { "epoch": 9.508489722966935, "grad_norm": 0.38726699352264404, "learning_rate": 3.672712048298721e-07, "loss": 0.7879, "num_input_tokens_seen": 36824008, "step": 63840 }, { "epoch": 9.509234435507894, "grad_norm": 0.22492650151252747, "learning_rate": 3.661621634658274e-07, "loss": 0.8127, "num_input_tokens_seen": 36826888, "step": 63845 }, { "epoch": 9.509979148048853, "grad_norm": 0.2137618213891983, "learning_rate": 3.650547867563886e-07, "loss": 0.8014, "num_input_tokens_seen": 36830056, "step": 63850 }, { "epoch": 9.510723860589813, "grad_norm": 0.23101134598255157, "learning_rate": 3.6394907477639294e-07, "loss": 0.7978, "num_input_tokens_seen": 36832968, "step": 63855 }, { "epoch": 9.511468573130772, "grad_norm": 0.1999993771314621, "learning_rate": 3.628450276005502e-07, "loss": 0.7964, "num_input_tokens_seen": 36835784, "step": 63860 }, { "epoch": 9.51221328567173, "grad_norm": 0.21533094346523285, "learning_rate": 3.6174264530347557e-07, "loss": 0.8173, "num_input_tokens_seen": 36838536, "step": 63865 }, { "epoch": 9.51295799821269, "grad_norm": 0.5741114616394043, "learning_rate": 3.6064192795965956e-07, "loss": 0.7887, "num_input_tokens_seen": 36841736, "step": 63870 }, { "epoch": 9.513702710753648, "grad_norm": 0.25433358550071716, "learning_rate": 3.595428756434871e-07, "loss": 0.7814, "num_input_tokens_seen": 36844392, "step": 63875 }, { "epoch": 9.514447423294609, "grad_norm": 0.3058772087097168, "learning_rate": 3.584454884292293e-07, "loss": 0.7846, "num_input_tokens_seen": 36847272, "step": 63880 }, { "epoch": 9.515192135835568, "grad_norm": 0.25880715250968933, "learning_rate": 3.5734976639103525e-07, "loss": 0.7927, "num_input_tokens_seen": 36850056, "step": 63885 }, { "epoch": 9.515936848376526, "grad_norm": 0.36095279455184937, "learning_rate": 3.5625570960295674e-07, "loss": 0.77, "num_input_tokens_seen": 36853128, "step": 63890 }, { "epoch": 9.516681560917485, "grad_norm": 0.21195828914642334, "learning_rate": 3.5516331813892355e-07, "loss": 0.8038, "num_input_tokens_seen": 36856104, "step": 63895 }, { "epoch": 9.517426273458446, "grad_norm": 0.23517856001853943, "learning_rate": 3.5407259207275444e-07, "loss": 0.8104, "num_input_tokens_seen": 36859112, "step": 63900 }, { "epoch": 9.518170985999404, "grad_norm": 0.3266381323337555, "learning_rate": 3.529835314781543e-07, "loss": 0.8059, "num_input_tokens_seen": 36862376, "step": 63905 }, { "epoch": 9.518915698540363, "grad_norm": 0.28131839632987976, "learning_rate": 3.5189613642872264e-07, "loss": 0.8105, "num_input_tokens_seen": 36865512, "step": 63910 }, { "epoch": 9.519660411081322, "grad_norm": 0.2534675896167755, "learning_rate": 3.508104069979368e-07, "loss": 0.85, "num_input_tokens_seen": 36868488, "step": 63915 }, { "epoch": 9.520405123622282, "grad_norm": 0.17943458259105682, "learning_rate": 3.4972634325916854e-07, "loss": 0.8162, "num_input_tokens_seen": 36871240, "step": 63920 }, { "epoch": 9.521149836163241, "grad_norm": 0.20771561563014984, "learning_rate": 3.486439452856705e-07, "loss": 0.8043, "num_input_tokens_seen": 36874120, "step": 63925 }, { "epoch": 9.5218945487042, "grad_norm": 0.20635010302066803, "learning_rate": 3.4756321315058957e-07, "loss": 0.7912, "num_input_tokens_seen": 36877128, "step": 63930 }, { "epoch": 9.522639261245159, "grad_norm": 0.2084212303161621, "learning_rate": 3.4648414692696196e-07, "loss": 0.7837, "num_input_tokens_seen": 36879976, "step": 63935 }, { "epoch": 9.52338397378612, "grad_norm": 0.2018941044807434, "learning_rate": 3.4540674668769866e-07, "loss": 0.8088, "num_input_tokens_seen": 36882760, "step": 63940 }, { "epoch": 9.524128686327078, "grad_norm": 0.24066424369812012, "learning_rate": 3.443310125056082e-07, "loss": 0.7984, "num_input_tokens_seen": 36885672, "step": 63945 }, { "epoch": 9.524873398868037, "grad_norm": 0.2899740934371948, "learning_rate": 3.4325694445338783e-07, "loss": 0.8126, "num_input_tokens_seen": 36888456, "step": 63950 }, { "epoch": 9.525618111408996, "grad_norm": 0.23347720503807068, "learning_rate": 3.42184542603613e-07, "loss": 0.8194, "num_input_tokens_seen": 36891496, "step": 63955 }, { "epoch": 9.526362823949956, "grad_norm": 0.19329802691936493, "learning_rate": 3.411138070287562e-07, "loss": 0.806, "num_input_tokens_seen": 36894376, "step": 63960 }, { "epoch": 9.527107536490915, "grad_norm": 0.2337370216846466, "learning_rate": 3.400447378011734e-07, "loss": 0.7916, "num_input_tokens_seen": 36897128, "step": 63965 }, { "epoch": 9.527852249031874, "grad_norm": 0.20064902305603027, "learning_rate": 3.389773349931069e-07, "loss": 0.7816, "num_input_tokens_seen": 36900040, "step": 63970 }, { "epoch": 9.528596961572832, "grad_norm": 0.24845685064792633, "learning_rate": 3.3791159867668786e-07, "loss": 0.8141, "num_input_tokens_seen": 36902984, "step": 63975 }, { "epoch": 9.529341674113793, "grad_norm": 0.20617792010307312, "learning_rate": 3.3684752892393643e-07, "loss": 0.7842, "num_input_tokens_seen": 36906152, "step": 63980 }, { "epoch": 9.530086386654752, "grad_norm": 0.3028528094291687, "learning_rate": 3.357851258067535e-07, "loss": 0.8059, "num_input_tokens_seen": 36909000, "step": 63985 }, { "epoch": 9.53083109919571, "grad_norm": 0.24653545022010803, "learning_rate": 3.347243893969343e-07, "loss": 0.7983, "num_input_tokens_seen": 36911848, "step": 63990 }, { "epoch": 9.53157581173667, "grad_norm": 0.2446807473897934, "learning_rate": 3.3366531976615767e-07, "loss": 0.7893, "num_input_tokens_seen": 36914824, "step": 63995 }, { "epoch": 9.53232052427763, "grad_norm": 0.19226963818073273, "learning_rate": 3.326079169859941e-07, "loss": 0.81, "num_input_tokens_seen": 36917384, "step": 64000 }, { "epoch": 9.533065236818588, "grad_norm": 0.1574273705482483, "learning_rate": 3.3155218112789763e-07, "loss": 0.805, "num_input_tokens_seen": 36920488, "step": 64005 }, { "epoch": 9.533809949359547, "grad_norm": 0.1705329269170761, "learning_rate": 3.3049811226321113e-07, "loss": 0.8015, "num_input_tokens_seen": 36923688, "step": 64010 }, { "epoch": 9.534554661900506, "grad_norm": 0.29104313254356384, "learning_rate": 3.2944571046316373e-07, "loss": 0.7914, "num_input_tokens_seen": 36926728, "step": 64015 }, { "epoch": 9.535299374441465, "grad_norm": 0.18376542627811432, "learning_rate": 3.283949757988708e-07, "loss": 0.829, "num_input_tokens_seen": 36929480, "step": 64020 }, { "epoch": 9.536044086982425, "grad_norm": 0.2921680808067322, "learning_rate": 3.273459083413366e-07, "loss": 0.771, "num_input_tokens_seen": 36932680, "step": 64025 }, { "epoch": 9.536788799523384, "grad_norm": 0.25524818897247314, "learning_rate": 3.2629850816145723e-07, "loss": 0.7822, "num_input_tokens_seen": 36935688, "step": 64030 }, { "epoch": 9.537533512064343, "grad_norm": 0.30485543608665466, "learning_rate": 3.2525277533000667e-07, "loss": 0.7894, "num_input_tokens_seen": 36938664, "step": 64035 }, { "epoch": 9.538278224605303, "grad_norm": 0.24306906759738922, "learning_rate": 3.242087099176533e-07, "loss": 0.8228, "num_input_tokens_seen": 36941736, "step": 64040 }, { "epoch": 9.539022937146262, "grad_norm": 0.1550278216600418, "learning_rate": 3.2316631199495186e-07, "loss": 0.8136, "num_input_tokens_seen": 36944680, "step": 64045 }, { "epoch": 9.53976764968722, "grad_norm": 0.2250126451253891, "learning_rate": 3.2212558163234043e-07, "loss": 0.7801, "num_input_tokens_seen": 36947496, "step": 64050 }, { "epoch": 9.54051236222818, "grad_norm": 0.32585665583610535, "learning_rate": 3.2108651890014884e-07, "loss": 0.8264, "num_input_tokens_seen": 36950504, "step": 64055 }, { "epoch": 9.541257074769138, "grad_norm": 0.21874293684959412, "learning_rate": 3.20049123868596e-07, "loss": 0.819, "num_input_tokens_seen": 36953160, "step": 64060 }, { "epoch": 9.542001787310099, "grad_norm": 0.31634825468063354, "learning_rate": 3.1901339660778127e-07, "loss": 0.808, "num_input_tokens_seen": 36956072, "step": 64065 }, { "epoch": 9.542746499851058, "grad_norm": 0.25880303978919983, "learning_rate": 3.17979337187696e-07, "loss": 0.8244, "num_input_tokens_seen": 36959080, "step": 64070 }, { "epoch": 9.543491212392016, "grad_norm": 0.17860862612724304, "learning_rate": 3.169469456782148e-07, "loss": 0.7955, "num_input_tokens_seen": 36961960, "step": 64075 }, { "epoch": 9.544235924932975, "grad_norm": 0.2160549759864807, "learning_rate": 3.1591622214910686e-07, "loss": 0.7828, "num_input_tokens_seen": 36964680, "step": 64080 }, { "epoch": 9.544980637473936, "grad_norm": 0.28803545236587524, "learning_rate": 3.1488716667002204e-07, "loss": 0.8083, "num_input_tokens_seen": 36967720, "step": 64085 }, { "epoch": 9.545725350014894, "grad_norm": 0.17680023610591888, "learning_rate": 3.138597793105019e-07, "loss": 0.7626, "num_input_tokens_seen": 36970760, "step": 64090 }, { "epoch": 9.546470062555853, "grad_norm": 0.2140069454908371, "learning_rate": 3.1283406013996874e-07, "loss": 0.7957, "num_input_tokens_seen": 36973864, "step": 64095 }, { "epoch": 9.547214775096812, "grad_norm": 0.17998172342777252, "learning_rate": 3.118100092277421e-07, "loss": 0.7727, "num_input_tokens_seen": 36976840, "step": 64100 }, { "epoch": 9.547959487637772, "grad_norm": 0.24400779604911804, "learning_rate": 3.1078762664301655e-07, "loss": 0.7942, "num_input_tokens_seen": 36979784, "step": 64105 }, { "epoch": 9.548704200178731, "grad_norm": 0.2417972832918167, "learning_rate": 3.097669124548869e-07, "loss": 0.7891, "num_input_tokens_seen": 36982760, "step": 64110 }, { "epoch": 9.54944891271969, "grad_norm": 0.398499459028244, "learning_rate": 3.087478667323257e-07, "loss": 0.8088, "num_input_tokens_seen": 36985544, "step": 64115 }, { "epoch": 9.550193625260649, "grad_norm": 0.2640843391418457, "learning_rate": 3.0773048954419457e-07, "loss": 0.818, "num_input_tokens_seen": 36988456, "step": 64120 }, { "epoch": 9.55093833780161, "grad_norm": 0.2047978639602661, "learning_rate": 3.0671478095924687e-07, "loss": 0.7879, "num_input_tokens_seen": 36991112, "step": 64125 }, { "epoch": 9.551683050342568, "grad_norm": 0.16822946071624756, "learning_rate": 3.057007410461166e-07, "loss": 0.8044, "num_input_tokens_seen": 36993960, "step": 64130 }, { "epoch": 9.552427762883527, "grad_norm": 0.2353556901216507, "learning_rate": 3.046883698733322e-07, "loss": 0.7945, "num_input_tokens_seen": 36996680, "step": 64135 }, { "epoch": 9.553172475424486, "grad_norm": 0.19674406945705414, "learning_rate": 3.036776675093056e-07, "loss": 0.8038, "num_input_tokens_seen": 36999624, "step": 64140 }, { "epoch": 9.553917187965446, "grad_norm": 0.26875635981559753, "learning_rate": 3.02668634022335e-07, "loss": 0.7985, "num_input_tokens_seen": 37002408, "step": 64145 }, { "epoch": 9.554661900506405, "grad_norm": 0.2440677285194397, "learning_rate": 3.016612694806048e-07, "loss": 0.8172, "num_input_tokens_seen": 37005448, "step": 64150 }, { "epoch": 9.555406613047364, "grad_norm": 0.2026529461145401, "learning_rate": 3.0065557395218825e-07, "loss": 0.8474, "num_input_tokens_seen": 37008424, "step": 64155 }, { "epoch": 9.556151325588322, "grad_norm": 0.1843930333852768, "learning_rate": 2.9965154750504764e-07, "loss": 0.801, "num_input_tokens_seen": 37010952, "step": 64160 }, { "epoch": 9.556896038129283, "grad_norm": 0.30088284611701965, "learning_rate": 2.9864919020703155e-07, "loss": 0.7834, "num_input_tokens_seen": 37013864, "step": 64165 }, { "epoch": 9.557640750670242, "grad_norm": 0.2747083306312561, "learning_rate": 2.976485021258746e-07, "loss": 0.8141, "num_input_tokens_seen": 37016584, "step": 64170 }, { "epoch": 9.5583854632112, "grad_norm": 0.25607049465179443, "learning_rate": 2.966494833292005e-07, "loss": 0.7894, "num_input_tokens_seen": 37019528, "step": 64175 }, { "epoch": 9.55913017575216, "grad_norm": 0.2578737735748291, "learning_rate": 2.9565213388451917e-07, "loss": 0.8025, "num_input_tokens_seen": 37022600, "step": 64180 }, { "epoch": 9.55987488829312, "grad_norm": 0.2172570675611496, "learning_rate": 2.9465645385922394e-07, "loss": 0.7998, "num_input_tokens_seen": 37025736, "step": 64185 }, { "epoch": 9.560619600834078, "grad_norm": 0.24821092188358307, "learning_rate": 2.9366244332060257e-07, "loss": 0.7868, "num_input_tokens_seen": 37028776, "step": 64190 }, { "epoch": 9.561364313375037, "grad_norm": 0.21713045239448547, "learning_rate": 2.926701023358208e-07, "loss": 0.8002, "num_input_tokens_seen": 37031496, "step": 64195 }, { "epoch": 9.562109025915996, "grad_norm": 0.22010985016822815, "learning_rate": 2.916794309719445e-07, "loss": 0.7801, "num_input_tokens_seen": 37034440, "step": 64200 }, { "epoch": 9.562853738456955, "grad_norm": 0.1623249053955078, "learning_rate": 2.906904292959145e-07, "loss": 0.811, "num_input_tokens_seen": 37037256, "step": 64205 }, { "epoch": 9.563598450997915, "grad_norm": 0.21423593163490295, "learning_rate": 2.8970309737456625e-07, "loss": 0.7986, "num_input_tokens_seen": 37040072, "step": 64210 }, { "epoch": 9.564343163538874, "grad_norm": 0.21270117163658142, "learning_rate": 2.8871743527461583e-07, "loss": 0.8074, "num_input_tokens_seen": 37042984, "step": 64215 }, { "epoch": 9.565087876079833, "grad_norm": 0.21265393495559692, "learning_rate": 2.877334430626738e-07, "loss": 0.7907, "num_input_tokens_seen": 37045768, "step": 64220 }, { "epoch": 9.565832588620792, "grad_norm": 0.23061048984527588, "learning_rate": 2.867511208052315e-07, "loss": 0.8134, "num_input_tokens_seen": 37048456, "step": 64225 }, { "epoch": 9.566577301161752, "grad_norm": 0.23838306963443756, "learning_rate": 2.857704685686718e-07, "loss": 0.7834, "num_input_tokens_seen": 37051208, "step": 64230 }, { "epoch": 9.56732201370271, "grad_norm": 0.23923902213573456, "learning_rate": 2.8479148641926134e-07, "loss": 0.7894, "num_input_tokens_seen": 37053992, "step": 64235 }, { "epoch": 9.56806672624367, "grad_norm": 0.21064575016498566, "learning_rate": 2.8381417442316093e-07, "loss": 0.8099, "num_input_tokens_seen": 37056840, "step": 64240 }, { "epoch": 9.568811438784628, "grad_norm": 0.25596681237220764, "learning_rate": 2.8283853264640947e-07, "loss": 0.7891, "num_input_tokens_seen": 37059944, "step": 64245 }, { "epoch": 9.569556151325589, "grad_norm": 0.1751238852739334, "learning_rate": 2.8186456115493475e-07, "loss": 0.7799, "num_input_tokens_seen": 37063112, "step": 64250 }, { "epoch": 9.570300863866548, "grad_norm": 0.26993295550346375, "learning_rate": 2.8089226001455913e-07, "loss": 0.7785, "num_input_tokens_seen": 37066216, "step": 64255 }, { "epoch": 9.571045576407506, "grad_norm": 0.22258931398391724, "learning_rate": 2.799216292909829e-07, "loss": 0.8145, "num_input_tokens_seen": 37068904, "step": 64260 }, { "epoch": 9.571790288948465, "grad_norm": 0.22386831045150757, "learning_rate": 2.789526690497979e-07, "loss": 0.7814, "num_input_tokens_seen": 37071656, "step": 64265 }, { "epoch": 9.572535001489426, "grad_norm": 0.20631694793701172, "learning_rate": 2.779853793564852e-07, "loss": 0.803, "num_input_tokens_seen": 37074600, "step": 64270 }, { "epoch": 9.573279714030384, "grad_norm": 0.2693535387516022, "learning_rate": 2.7701976027640353e-07, "loss": 0.7926, "num_input_tokens_seen": 37077384, "step": 64275 }, { "epoch": 9.574024426571343, "grad_norm": 0.29628661274909973, "learning_rate": 2.7605581187481467e-07, "loss": 0.7837, "num_input_tokens_seen": 37080584, "step": 64280 }, { "epoch": 9.574769139112302, "grad_norm": 0.20165066421031952, "learning_rate": 2.750935342168526e-07, "loss": 0.791, "num_input_tokens_seen": 37083400, "step": 64285 }, { "epoch": 9.575513851653263, "grad_norm": 0.293448269367218, "learning_rate": 2.741329273675458e-07, "loss": 0.7819, "num_input_tokens_seen": 37086600, "step": 64290 }, { "epoch": 9.576258564194221, "grad_norm": 0.25482794642448425, "learning_rate": 2.7317399139180634e-07, "loss": 0.8219, "num_input_tokens_seen": 37089288, "step": 64295 }, { "epoch": 9.57700327673518, "grad_norm": 0.19367893040180206, "learning_rate": 2.7221672635443783e-07, "loss": 0.7769, "num_input_tokens_seen": 37092136, "step": 64300 }, { "epoch": 9.577747989276139, "grad_norm": 0.3385026454925537, "learning_rate": 2.712611323201275e-07, "loss": 0.811, "num_input_tokens_seen": 37095208, "step": 64305 }, { "epoch": 9.5784927018171, "grad_norm": 0.18326519429683685, "learning_rate": 2.7030720935344867e-07, "loss": 0.8001, "num_input_tokens_seen": 37097928, "step": 64310 }, { "epoch": 9.579237414358058, "grad_norm": 0.2339717596769333, "learning_rate": 2.6935495751886644e-07, "loss": 0.7933, "num_input_tokens_seen": 37100712, "step": 64315 }, { "epoch": 9.579982126899017, "grad_norm": 0.19205088913440704, "learning_rate": 2.6840437688072653e-07, "loss": 0.781, "num_input_tokens_seen": 37103656, "step": 64320 }, { "epoch": 9.580726839439976, "grad_norm": 0.20803570747375488, "learning_rate": 2.6745546750326924e-07, "loss": 0.8236, "num_input_tokens_seen": 37106568, "step": 64325 }, { "epoch": 9.581471551980936, "grad_norm": 0.2416553646326065, "learning_rate": 2.665082294506155e-07, "loss": 0.7899, "num_input_tokens_seen": 37109128, "step": 64330 }, { "epoch": 9.582216264521895, "grad_norm": 0.33450573682785034, "learning_rate": 2.655626627867752e-07, "loss": 0.8186, "num_input_tokens_seen": 37112168, "step": 64335 }, { "epoch": 9.582960977062854, "grad_norm": 0.22516296803951263, "learning_rate": 2.6461876757565007e-07, "loss": 0.8139, "num_input_tokens_seen": 37114952, "step": 64340 }, { "epoch": 9.583705689603812, "grad_norm": 0.1980990320444107, "learning_rate": 2.6367654388102236e-07, "loss": 0.7738, "num_input_tokens_seen": 37118248, "step": 64345 }, { "epoch": 9.584450402144771, "grad_norm": 0.1724989414215088, "learning_rate": 2.6273599176656063e-07, "loss": 0.7858, "num_input_tokens_seen": 37121160, "step": 64350 }, { "epoch": 9.585195114685732, "grad_norm": 0.2415444701910019, "learning_rate": 2.617971112958278e-07, "loss": 0.7931, "num_input_tokens_seen": 37124264, "step": 64355 }, { "epoch": 9.58593982722669, "grad_norm": 0.1643584668636322, "learning_rate": 2.6085990253226776e-07, "loss": 0.8097, "num_input_tokens_seen": 37126888, "step": 64360 }, { "epoch": 9.58668453976765, "grad_norm": 0.19222727417945862, "learning_rate": 2.5992436553921304e-07, "loss": 0.8289, "num_input_tokens_seen": 37130056, "step": 64365 }, { "epoch": 9.58742925230861, "grad_norm": 0.1754036694765091, "learning_rate": 2.589905003798826e-07, "loss": 0.7727, "num_input_tokens_seen": 37132840, "step": 64370 }, { "epoch": 9.588173964849569, "grad_norm": 0.2537063956260681, "learning_rate": 2.58058307117387e-07, "loss": 0.7998, "num_input_tokens_seen": 37135976, "step": 64375 }, { "epoch": 9.588918677390527, "grad_norm": 0.2089383602142334, "learning_rate": 2.571277858147175e-07, "loss": 0.8237, "num_input_tokens_seen": 37139112, "step": 64380 }, { "epoch": 9.589663389931486, "grad_norm": 0.16138772666454315, "learning_rate": 2.561989365347545e-07, "loss": 0.8045, "num_input_tokens_seen": 37141704, "step": 64385 }, { "epoch": 9.590408102472445, "grad_norm": 0.20364905893802643, "learning_rate": 2.5527175934026426e-07, "loss": 0.8056, "num_input_tokens_seen": 37144424, "step": 64390 }, { "epoch": 9.591152815013405, "grad_norm": 0.24064978957176208, "learning_rate": 2.5434625429390515e-07, "loss": 0.779, "num_input_tokens_seen": 37147368, "step": 64395 }, { "epoch": 9.591897527554364, "grad_norm": 0.22588714957237244, "learning_rate": 2.534224214582187e-07, "loss": 0.8174, "num_input_tokens_seen": 37149992, "step": 64400 }, { "epoch": 9.592642240095323, "grad_norm": 0.2939435839653015, "learning_rate": 2.5250026089563004e-07, "loss": 0.8151, "num_input_tokens_seen": 37152904, "step": 64405 }, { "epoch": 9.593386952636282, "grad_norm": 0.2567099928855896, "learning_rate": 2.5157977266846157e-07, "loss": 0.8037, "num_input_tokens_seen": 37155784, "step": 64410 }, { "epoch": 9.594131665177242, "grad_norm": 0.22579532861709595, "learning_rate": 2.5066095683891067e-07, "loss": 0.7867, "num_input_tokens_seen": 37158632, "step": 64415 }, { "epoch": 9.594876377718201, "grad_norm": 0.16474281251430511, "learning_rate": 2.497438134690694e-07, "loss": 0.8147, "num_input_tokens_seen": 37161416, "step": 64420 }, { "epoch": 9.59562109025916, "grad_norm": 0.16330721974372864, "learning_rate": 2.4882834262091317e-07, "loss": 0.8043, "num_input_tokens_seen": 37164232, "step": 64425 }, { "epoch": 9.596365802800118, "grad_norm": 0.22372329235076904, "learning_rate": 2.4791454435630634e-07, "loss": 0.783, "num_input_tokens_seen": 37167272, "step": 64430 }, { "epoch": 9.597110515341079, "grad_norm": 0.21634799242019653, "learning_rate": 2.4700241873699957e-07, "loss": 0.7886, "num_input_tokens_seen": 37169992, "step": 64435 }, { "epoch": 9.597855227882038, "grad_norm": 0.18959420919418335, "learning_rate": 2.460919658246297e-07, "loss": 0.7841, "num_input_tokens_seen": 37172840, "step": 64440 }, { "epoch": 9.598599940422996, "grad_norm": 0.2550523281097412, "learning_rate": 2.4518318568072797e-07, "loss": 0.8142, "num_input_tokens_seen": 37175848, "step": 64445 }, { "epoch": 9.599344652963955, "grad_norm": 0.23365511000156403, "learning_rate": 2.442760783666953e-07, "loss": 0.7842, "num_input_tokens_seen": 37178632, "step": 64450 }, { "epoch": 9.600089365504916, "grad_norm": 0.24017933011054993, "learning_rate": 2.433706439438382e-07, "loss": 0.8012, "num_input_tokens_seen": 37181736, "step": 64455 }, { "epoch": 9.600834078045875, "grad_norm": 0.22453615069389343, "learning_rate": 2.4246688247334117e-07, "loss": 0.7883, "num_input_tokens_seen": 37184552, "step": 64460 }, { "epoch": 9.601578790586833, "grad_norm": 0.35645774006843567, "learning_rate": 2.4156479401627465e-07, "loss": 0.8068, "num_input_tokens_seen": 37187592, "step": 64465 }, { "epoch": 9.602323503127792, "grad_norm": 0.1556309312582016, "learning_rate": 2.4066437863359545e-07, "loss": 0.7835, "num_input_tokens_seen": 37190248, "step": 64470 }, { "epoch": 9.603068215668753, "grad_norm": 0.21097822487354279, "learning_rate": 2.397656363861578e-07, "loss": 0.798, "num_input_tokens_seen": 37193096, "step": 64475 }, { "epoch": 9.603812928209711, "grad_norm": 0.19189774990081787, "learning_rate": 2.388685673346908e-07, "loss": 0.8011, "num_input_tokens_seen": 37195784, "step": 64480 }, { "epoch": 9.60455764075067, "grad_norm": 0.23483189940452576, "learning_rate": 2.379731715398098e-07, "loss": 0.811, "num_input_tokens_seen": 37198536, "step": 64485 }, { "epoch": 9.605302353291629, "grad_norm": 0.17672474682331085, "learning_rate": 2.3707944906203038e-07, "loss": 0.7884, "num_input_tokens_seen": 37201256, "step": 64490 }, { "epoch": 9.60604706583259, "grad_norm": 0.21237599849700928, "learning_rate": 2.361873999617431e-07, "loss": 0.8086, "num_input_tokens_seen": 37203912, "step": 64495 }, { "epoch": 9.606791778373548, "grad_norm": 0.17005546391010284, "learning_rate": 2.352970242992303e-07, "loss": 0.8113, "num_input_tokens_seen": 37206664, "step": 64500 }, { "epoch": 9.607536490914507, "grad_norm": 0.2806089222431183, "learning_rate": 2.344083221346549e-07, "loss": 0.8147, "num_input_tokens_seen": 37209800, "step": 64505 }, { "epoch": 9.608281203455466, "grad_norm": 0.2954137623310089, "learning_rate": 2.3352129352808007e-07, "loss": 0.7985, "num_input_tokens_seen": 37212712, "step": 64510 }, { "epoch": 9.609025915996426, "grad_norm": 0.25532081723213196, "learning_rate": 2.326359385394383e-07, "loss": 0.7355, "num_input_tokens_seen": 37215656, "step": 64515 }, { "epoch": 9.609770628537385, "grad_norm": 0.201624795794487, "learning_rate": 2.317522572285652e-07, "loss": 0.7935, "num_input_tokens_seen": 37218472, "step": 64520 }, { "epoch": 9.610515341078344, "grad_norm": 0.195009246468544, "learning_rate": 2.30870249655174e-07, "loss": 0.7914, "num_input_tokens_seen": 37221064, "step": 64525 }, { "epoch": 9.611260053619302, "grad_norm": 0.23415213823318481, "learning_rate": 2.2998991587886709e-07, "loss": 0.777, "num_input_tokens_seen": 37223816, "step": 64530 }, { "epoch": 9.612004766160261, "grad_norm": 0.24272240698337555, "learning_rate": 2.2911125595913296e-07, "loss": 0.789, "num_input_tokens_seen": 37226600, "step": 64535 }, { "epoch": 9.612749478701222, "grad_norm": 0.2401636242866516, "learning_rate": 2.2823426995535192e-07, "loss": 0.7979, "num_input_tokens_seen": 37229512, "step": 64540 }, { "epoch": 9.61349419124218, "grad_norm": 0.14336304366588593, "learning_rate": 2.2735895792678485e-07, "loss": 0.7887, "num_input_tokens_seen": 37232360, "step": 64545 }, { "epoch": 9.61423890378314, "grad_norm": 0.20307838916778564, "learning_rate": 2.2648531993257893e-07, "loss": 0.7969, "num_input_tokens_seen": 37235304, "step": 64550 }, { "epoch": 9.6149836163241, "grad_norm": 0.3045879006385803, "learning_rate": 2.2561335603177302e-07, "loss": 0.7947, "num_input_tokens_seen": 37238216, "step": 64555 }, { "epoch": 9.615728328865059, "grad_norm": 0.21227313578128815, "learning_rate": 2.2474306628329222e-07, "loss": 0.8423, "num_input_tokens_seen": 37240904, "step": 64560 }, { "epoch": 9.616473041406017, "grad_norm": 0.2201763540506363, "learning_rate": 2.2387445074594505e-07, "loss": 0.7716, "num_input_tokens_seen": 37243976, "step": 64565 }, { "epoch": 9.617217753946976, "grad_norm": 0.27436837553977966, "learning_rate": 2.2300750947843174e-07, "loss": 0.7922, "num_input_tokens_seen": 37246728, "step": 64570 }, { "epoch": 9.617962466487935, "grad_norm": 0.20973685383796692, "learning_rate": 2.2214224253933326e-07, "loss": 0.7871, "num_input_tokens_seen": 37249512, "step": 64575 }, { "epoch": 9.618707179028895, "grad_norm": 0.3624193072319031, "learning_rate": 2.21278649987125e-07, "loss": 0.868, "num_input_tokens_seen": 37252552, "step": 64580 }, { "epoch": 9.619451891569854, "grad_norm": 0.3070467710494995, "learning_rate": 2.204167318801603e-07, "loss": 0.7874, "num_input_tokens_seen": 37255560, "step": 64585 }, { "epoch": 9.620196604110813, "grad_norm": 0.18772874772548676, "learning_rate": 2.1955648827668708e-07, "loss": 0.7937, "num_input_tokens_seen": 37258280, "step": 64590 }, { "epoch": 9.620941316651772, "grad_norm": 0.18391703069210052, "learning_rate": 2.186979192348365e-07, "loss": 0.788, "num_input_tokens_seen": 37261064, "step": 64595 }, { "epoch": 9.621686029192732, "grad_norm": 0.28373637795448303, "learning_rate": 2.178410248126289e-07, "loss": 0.7921, "num_input_tokens_seen": 37263880, "step": 64600 }, { "epoch": 9.622430741733691, "grad_norm": 0.22687679529190063, "learning_rate": 2.1698580506796517e-07, "loss": 0.7838, "num_input_tokens_seen": 37266504, "step": 64605 }, { "epoch": 9.62317545427465, "grad_norm": 0.26662546396255493, "learning_rate": 2.1613226005864074e-07, "loss": 0.8146, "num_input_tokens_seen": 37269320, "step": 64610 }, { "epoch": 9.623920166815608, "grad_norm": 0.1519821733236313, "learning_rate": 2.1528038984233722e-07, "loss": 0.793, "num_input_tokens_seen": 37272200, "step": 64615 }, { "epoch": 9.624664879356569, "grad_norm": 0.2990172505378723, "learning_rate": 2.1443019447661417e-07, "loss": 0.8203, "num_input_tokens_seen": 37275208, "step": 64620 }, { "epoch": 9.625409591897528, "grad_norm": 0.20667888224124908, "learning_rate": 2.1358167401892838e-07, "loss": 0.7824, "num_input_tokens_seen": 37278088, "step": 64625 }, { "epoch": 9.626154304438487, "grad_norm": 0.20604035258293152, "learning_rate": 2.1273482852662007e-07, "loss": 0.7777, "num_input_tokens_seen": 37280872, "step": 64630 }, { "epoch": 9.626899016979445, "grad_norm": 0.1976003795862198, "learning_rate": 2.1188965805691297e-07, "loss": 0.7978, "num_input_tokens_seen": 37283848, "step": 64635 }, { "epoch": 9.627643729520406, "grad_norm": 0.21288326382637024, "learning_rate": 2.1104616266692524e-07, "loss": 0.798, "num_input_tokens_seen": 37286760, "step": 64640 }, { "epoch": 9.628388442061365, "grad_norm": 0.1739601045846939, "learning_rate": 2.1020434241365017e-07, "loss": 0.7865, "num_input_tokens_seen": 37289448, "step": 64645 }, { "epoch": 9.629133154602323, "grad_norm": 0.19623033702373505, "learning_rate": 2.0936419735397562e-07, "loss": 0.8103, "num_input_tokens_seen": 37292168, "step": 64650 }, { "epoch": 9.629877867143282, "grad_norm": 0.2394324392080307, "learning_rate": 2.0852572754468113e-07, "loss": 0.7705, "num_input_tokens_seen": 37295432, "step": 64655 }, { "epoch": 9.630622579684243, "grad_norm": 0.1898649036884308, "learning_rate": 2.0768893304242142e-07, "loss": 0.8025, "num_input_tokens_seen": 37298088, "step": 64660 }, { "epoch": 9.631367292225201, "grad_norm": 0.2494773119688034, "learning_rate": 2.0685381390374568e-07, "loss": 0.7988, "num_input_tokens_seen": 37300904, "step": 64665 }, { "epoch": 9.63211200476616, "grad_norm": 0.21638382971286774, "learning_rate": 2.0602037018508658e-07, "loss": 0.8159, "num_input_tokens_seen": 37303912, "step": 64670 }, { "epoch": 9.632856717307119, "grad_norm": 0.21639488637447357, "learning_rate": 2.0518860194276846e-07, "loss": 0.8103, "num_input_tokens_seen": 37306728, "step": 64675 }, { "epoch": 9.63360142984808, "grad_norm": 0.247780442237854, "learning_rate": 2.043585092329936e-07, "loss": 0.7971, "num_input_tokens_seen": 37309256, "step": 64680 }, { "epoch": 9.634346142389038, "grad_norm": 0.20613369345664978, "learning_rate": 2.035300921118616e-07, "loss": 0.8143, "num_input_tokens_seen": 37311976, "step": 64685 }, { "epoch": 9.635090854929997, "grad_norm": 0.2795129716396332, "learning_rate": 2.0270335063534706e-07, "loss": 0.7987, "num_input_tokens_seen": 37314728, "step": 64690 }, { "epoch": 9.635835567470956, "grad_norm": 0.21185427904129028, "learning_rate": 2.018782848593248e-07, "loss": 0.8043, "num_input_tokens_seen": 37317768, "step": 64695 }, { "epoch": 9.636580280011916, "grad_norm": 0.2288254052400589, "learning_rate": 2.0105489483954466e-07, "loss": 0.805, "num_input_tokens_seen": 37320776, "step": 64700 }, { "epoch": 9.637324992552875, "grad_norm": 0.20417770743370056, "learning_rate": 2.0023318063165098e-07, "loss": 0.794, "num_input_tokens_seen": 37323656, "step": 64705 }, { "epoch": 9.638069705093834, "grad_norm": 0.25349459052085876, "learning_rate": 1.9941314229117157e-07, "loss": 0.8745, "num_input_tokens_seen": 37326792, "step": 64710 }, { "epoch": 9.638814417634793, "grad_norm": 0.24595874547958374, "learning_rate": 1.9859477987351771e-07, "loss": 0.8267, "num_input_tokens_seen": 37329736, "step": 64715 }, { "epoch": 9.639559130175751, "grad_norm": 0.26219940185546875, "learning_rate": 1.9777809343399234e-07, "loss": 0.7816, "num_input_tokens_seen": 37332648, "step": 64720 }, { "epoch": 9.640303842716712, "grad_norm": 0.1911228448152542, "learning_rate": 1.9696308302778744e-07, "loss": 0.7872, "num_input_tokens_seen": 37335752, "step": 64725 }, { "epoch": 9.64104855525767, "grad_norm": 0.21514162421226501, "learning_rate": 1.961497487099756e-07, "loss": 0.7764, "num_input_tokens_seen": 37338472, "step": 64730 }, { "epoch": 9.64179326779863, "grad_norm": 0.22473502159118652, "learning_rate": 1.9533809053551565e-07, "loss": 0.7923, "num_input_tokens_seen": 37341448, "step": 64735 }, { "epoch": 9.642537980339588, "grad_norm": 0.19332577288150787, "learning_rate": 1.9452810855926372e-07, "loss": 0.7873, "num_input_tokens_seen": 37344392, "step": 64740 }, { "epoch": 9.643282692880549, "grad_norm": 0.23389078676700592, "learning_rate": 1.9371980283594826e-07, "loss": 0.8064, "num_input_tokens_seen": 37347080, "step": 64745 }, { "epoch": 9.644027405421507, "grad_norm": 0.1717722862958908, "learning_rate": 1.929131734201922e-07, "loss": 0.7702, "num_input_tokens_seen": 37349896, "step": 64750 }, { "epoch": 9.644772117962466, "grad_norm": 0.2227514684200287, "learning_rate": 1.9210822036650755e-07, "loss": 0.8035, "num_input_tokens_seen": 37352712, "step": 64755 }, { "epoch": 9.645516830503425, "grad_norm": 0.21489283442497253, "learning_rate": 1.9130494372928688e-07, "loss": 0.7736, "num_input_tokens_seen": 37355720, "step": 64760 }, { "epoch": 9.646261543044385, "grad_norm": 0.20290018618106842, "learning_rate": 1.9050334356281175e-07, "loss": 0.7778, "num_input_tokens_seen": 37358408, "step": 64765 }, { "epoch": 9.647006255585344, "grad_norm": 0.2849222719669342, "learning_rate": 1.897034199212555e-07, "loss": 0.7764, "num_input_tokens_seen": 37361544, "step": 64770 }, { "epoch": 9.647750968126303, "grad_norm": 0.19916607439517975, "learning_rate": 1.8890517285866938e-07, "loss": 0.809, "num_input_tokens_seen": 37364328, "step": 64775 }, { "epoch": 9.648495680667262, "grad_norm": 0.2143007516860962, "learning_rate": 1.881086024289963e-07, "loss": 0.7829, "num_input_tokens_seen": 37367464, "step": 64780 }, { "epoch": 9.649240393208222, "grad_norm": 0.21752913296222687, "learning_rate": 1.8731370868606824e-07, "loss": 0.8367, "num_input_tokens_seen": 37370216, "step": 64785 }, { "epoch": 9.649985105749181, "grad_norm": 0.22968484461307526, "learning_rate": 1.8652049168359774e-07, "loss": 0.7978, "num_input_tokens_seen": 37372840, "step": 64790 }, { "epoch": 9.65072981829014, "grad_norm": 0.19643840193748474, "learning_rate": 1.857289514751892e-07, "loss": 0.7841, "num_input_tokens_seen": 37375816, "step": 64795 }, { "epoch": 9.651474530831099, "grad_norm": 0.23643934726715088, "learning_rate": 1.849390881143276e-07, "loss": 0.782, "num_input_tokens_seen": 37378664, "step": 64800 }, { "epoch": 9.652219243372059, "grad_norm": 0.19526337087154388, "learning_rate": 1.8415090165439519e-07, "loss": 0.7822, "num_input_tokens_seen": 37381384, "step": 64805 }, { "epoch": 9.652963955913018, "grad_norm": 0.24415916204452515, "learning_rate": 1.8336439214864943e-07, "loss": 0.7983, "num_input_tokens_seen": 37384072, "step": 64810 }, { "epoch": 9.653708668453977, "grad_norm": 0.20002955198287964, "learning_rate": 1.8257955965023943e-07, "loss": 0.7876, "num_input_tokens_seen": 37386856, "step": 64815 }, { "epoch": 9.654453380994935, "grad_norm": 0.18651168048381805, "learning_rate": 1.8179640421220333e-07, "loss": 0.8106, "num_input_tokens_seen": 37389480, "step": 64820 }, { "epoch": 9.655198093535896, "grad_norm": 0.23645254969596863, "learning_rate": 1.8101492588746549e-07, "loss": 0.8187, "num_input_tokens_seen": 37392200, "step": 64825 }, { "epoch": 9.655942806076855, "grad_norm": 0.304508239030838, "learning_rate": 1.8023512472883087e-07, "loss": 0.7911, "num_input_tokens_seen": 37395016, "step": 64830 }, { "epoch": 9.656687518617813, "grad_norm": 0.23037198185920715, "learning_rate": 1.794570007889962e-07, "loss": 0.7891, "num_input_tokens_seen": 37397704, "step": 64835 }, { "epoch": 9.657432231158772, "grad_norm": 0.18335972726345062, "learning_rate": 1.7868055412054442e-07, "loss": 0.798, "num_input_tokens_seen": 37400424, "step": 64840 }, { "epoch": 9.658176943699733, "grad_norm": 0.278017520904541, "learning_rate": 1.7790578477594466e-07, "loss": 0.7328, "num_input_tokens_seen": 37404648, "step": 64845 }, { "epoch": 9.658921656240691, "grad_norm": 0.16521313786506653, "learning_rate": 1.771326928075523e-07, "loss": 0.8132, "num_input_tokens_seen": 37407656, "step": 64850 }, { "epoch": 9.65966636878165, "grad_norm": 0.2722867429256439, "learning_rate": 1.7636127826760884e-07, "loss": 0.77, "num_input_tokens_seen": 37410408, "step": 64855 }, { "epoch": 9.660411081322609, "grad_norm": 0.2569904625415802, "learning_rate": 1.7559154120824483e-07, "loss": 0.8102, "num_input_tokens_seen": 37413064, "step": 64860 }, { "epoch": 9.66115579386357, "grad_norm": 0.3817636966705322, "learning_rate": 1.7482348168147978e-07, "loss": 0.8248, "num_input_tokens_seen": 37416040, "step": 64865 }, { "epoch": 9.661900506404528, "grad_norm": 0.2602488398551941, "learning_rate": 1.7405709973920824e-07, "loss": 0.8029, "num_input_tokens_seen": 37419176, "step": 64870 }, { "epoch": 9.662645218945487, "grad_norm": 0.22780217230319977, "learning_rate": 1.7329239543322494e-07, "loss": 0.8074, "num_input_tokens_seen": 37421992, "step": 64875 }, { "epoch": 9.663389931486446, "grad_norm": 0.2013959288597107, "learning_rate": 1.7252936881520244e-07, "loss": 0.8095, "num_input_tokens_seen": 37425128, "step": 64880 }, { "epoch": 9.664134644027406, "grad_norm": 0.3865635395050049, "learning_rate": 1.7176801993670499e-07, "loss": 0.8055, "num_input_tokens_seen": 37427976, "step": 64885 }, { "epoch": 9.664879356568365, "grad_norm": 0.2575259208679199, "learning_rate": 1.7100834884918037e-07, "loss": 0.829, "num_input_tokens_seen": 37430952, "step": 64890 }, { "epoch": 9.665624069109324, "grad_norm": 0.1951572149991989, "learning_rate": 1.7025035560396252e-07, "loss": 0.8044, "num_input_tokens_seen": 37433576, "step": 64895 }, { "epoch": 9.666368781650283, "grad_norm": 0.19867752492427826, "learning_rate": 1.6949404025227435e-07, "loss": 0.8076, "num_input_tokens_seen": 37436904, "step": 64900 }, { "epoch": 9.667113494191241, "grad_norm": 0.2887886166572571, "learning_rate": 1.6873940284523048e-07, "loss": 0.7926, "num_input_tokens_seen": 37439880, "step": 64905 }, { "epoch": 9.667858206732202, "grad_norm": 0.22623680531978607, "learning_rate": 1.6798644343381798e-07, "loss": 0.8044, "num_input_tokens_seen": 37442760, "step": 64910 }, { "epoch": 9.66860291927316, "grad_norm": 0.18317900598049164, "learning_rate": 1.672351620689211e-07, "loss": 0.8054, "num_input_tokens_seen": 37445736, "step": 64915 }, { "epoch": 9.66934763181412, "grad_norm": 0.19892476499080658, "learning_rate": 1.6648555880131033e-07, "loss": 0.7822, "num_input_tokens_seen": 37448904, "step": 64920 }, { "epoch": 9.670092344355078, "grad_norm": 0.21764744818210602, "learning_rate": 1.6573763368163964e-07, "loss": 0.794, "num_input_tokens_seen": 37451752, "step": 64925 }, { "epoch": 9.670837056896039, "grad_norm": 0.22612518072128296, "learning_rate": 1.6499138676045188e-07, "loss": 0.7879, "num_input_tokens_seen": 37454728, "step": 64930 }, { "epoch": 9.671581769436997, "grad_norm": 0.21459484100341797, "learning_rate": 1.6424681808817343e-07, "loss": 0.7904, "num_input_tokens_seen": 37457416, "step": 64935 }, { "epoch": 9.672326481977956, "grad_norm": 0.28175729513168335, "learning_rate": 1.6350392771512234e-07, "loss": 0.8087, "num_input_tokens_seen": 37460424, "step": 64940 }, { "epoch": 9.673071194518915, "grad_norm": 0.13352519273757935, "learning_rate": 1.6276271569149738e-07, "loss": 0.8156, "num_input_tokens_seen": 37463560, "step": 64945 }, { "epoch": 9.673815907059875, "grad_norm": 0.19022808969020844, "learning_rate": 1.6202318206738342e-07, "loss": 0.8057, "num_input_tokens_seen": 37466376, "step": 64950 }, { "epoch": 9.674560619600834, "grad_norm": 0.26093199849128723, "learning_rate": 1.6128532689276277e-07, "loss": 0.8126, "num_input_tokens_seen": 37469256, "step": 64955 }, { "epoch": 9.675305332141793, "grad_norm": 0.2917397618293762, "learning_rate": 1.6054915021748996e-07, "loss": 0.7927, "num_input_tokens_seen": 37472264, "step": 64960 }, { "epoch": 9.676050044682752, "grad_norm": 0.22516243159770966, "learning_rate": 1.5981465209131686e-07, "loss": 0.7832, "num_input_tokens_seen": 37475272, "step": 64965 }, { "epoch": 9.676794757223712, "grad_norm": 0.2174566239118576, "learning_rate": 1.5908183256387877e-07, "loss": 0.8223, "num_input_tokens_seen": 37478024, "step": 64970 }, { "epoch": 9.677539469764671, "grad_norm": 0.14403294026851654, "learning_rate": 1.583506916846944e-07, "loss": 0.7957, "num_input_tokens_seen": 37480808, "step": 64975 }, { "epoch": 9.67828418230563, "grad_norm": 0.19212156534194946, "learning_rate": 1.5762122950316871e-07, "loss": 0.7951, "num_input_tokens_seen": 37483624, "step": 64980 }, { "epoch": 9.679028894846589, "grad_norm": 0.21904601156711578, "learning_rate": 1.5689344606860112e-07, "loss": 0.8368, "num_input_tokens_seen": 37486696, "step": 64985 }, { "epoch": 9.679773607387549, "grad_norm": 0.299285888671875, "learning_rate": 1.5616734143016898e-07, "loss": 0.7956, "num_input_tokens_seen": 37489384, "step": 64990 }, { "epoch": 9.680518319928508, "grad_norm": 0.15440060198307037, "learning_rate": 1.5544291563693858e-07, "loss": 0.7982, "num_input_tokens_seen": 37492104, "step": 64995 }, { "epoch": 9.681263032469467, "grad_norm": 0.2584143877029419, "learning_rate": 1.5472016873786798e-07, "loss": 0.7855, "num_input_tokens_seen": 37495336, "step": 65000 }, { "epoch": 9.682007745010425, "grad_norm": 0.22715039551258087, "learning_rate": 1.5399910078179314e-07, "loss": 0.8108, "num_input_tokens_seen": 37498120, "step": 65005 }, { "epoch": 9.682752457551386, "grad_norm": 0.25714895129203796, "learning_rate": 1.532797118174417e-07, "loss": 0.826, "num_input_tokens_seen": 37500936, "step": 65010 }, { "epoch": 9.683497170092345, "grad_norm": 0.18973472714424133, "learning_rate": 1.5256200189343038e-07, "loss": 0.7865, "num_input_tokens_seen": 37503464, "step": 65015 }, { "epoch": 9.684241882633303, "grad_norm": 0.1957906186580658, "learning_rate": 1.518459710582565e-07, "loss": 0.7963, "num_input_tokens_seen": 37506312, "step": 65020 }, { "epoch": 9.684986595174262, "grad_norm": 0.2179616093635559, "learning_rate": 1.511316193603063e-07, "loss": 0.7958, "num_input_tokens_seen": 37509128, "step": 65025 }, { "epoch": 9.685731307715223, "grad_norm": 0.24046745896339417, "learning_rate": 1.504189468478523e-07, "loss": 0.8071, "num_input_tokens_seen": 37511976, "step": 65030 }, { "epoch": 9.686476020256181, "grad_norm": 0.2579069137573242, "learning_rate": 1.497079535690532e-07, "loss": 0.7562, "num_input_tokens_seen": 37514952, "step": 65035 }, { "epoch": 9.68722073279714, "grad_norm": 0.2164759486913681, "learning_rate": 1.4899863957195948e-07, "loss": 0.7993, "num_input_tokens_seen": 37517704, "step": 65040 }, { "epoch": 9.687965445338099, "grad_norm": 0.32220259308815, "learning_rate": 1.4829100490449942e-07, "loss": 0.7964, "num_input_tokens_seen": 37520488, "step": 65045 }, { "epoch": 9.688710157879058, "grad_norm": 0.3264293670654297, "learning_rate": 1.4758504961449315e-07, "loss": 0.769, "num_input_tokens_seen": 37523624, "step": 65050 }, { "epoch": 9.689454870420018, "grad_norm": 0.1396968811750412, "learning_rate": 1.468807737496497e-07, "loss": 0.8335, "num_input_tokens_seen": 37526344, "step": 65055 }, { "epoch": 9.690199582960977, "grad_norm": 0.2367973029613495, "learning_rate": 1.4617817735755323e-07, "loss": 0.799, "num_input_tokens_seen": 37529032, "step": 65060 }, { "epoch": 9.690944295501936, "grad_norm": 0.23352396488189697, "learning_rate": 1.4547726048569077e-07, "loss": 0.7981, "num_input_tokens_seen": 37531976, "step": 65065 }, { "epoch": 9.691689008042896, "grad_norm": 0.2159416824579239, "learning_rate": 1.447780231814244e-07, "loss": 0.7962, "num_input_tokens_seen": 37535016, "step": 65070 }, { "epoch": 9.692433720583855, "grad_norm": 0.17747388780117035, "learning_rate": 1.4408046549200528e-07, "loss": 0.8161, "num_input_tokens_seen": 37537800, "step": 65075 }, { "epoch": 9.693178433124814, "grad_norm": 0.2748667895793915, "learning_rate": 1.4338458746457062e-07, "loss": 0.7861, "num_input_tokens_seen": 37541192, "step": 65080 }, { "epoch": 9.693923145665773, "grad_norm": 0.22636790573596954, "learning_rate": 1.4269038914614397e-07, "loss": 0.8087, "num_input_tokens_seen": 37543976, "step": 65085 }, { "epoch": 9.694667858206731, "grad_norm": 0.18579773604869843, "learning_rate": 1.4199787058364056e-07, "loss": 0.7952, "num_input_tokens_seen": 37547080, "step": 65090 }, { "epoch": 9.695412570747692, "grad_norm": 0.21616239845752716, "learning_rate": 1.413070318238535e-07, "loss": 0.8066, "num_input_tokens_seen": 37550120, "step": 65095 }, { "epoch": 9.69615728328865, "grad_norm": 0.3078809976577759, "learning_rate": 1.4061787291347051e-07, "loss": 0.8298, "num_input_tokens_seen": 37553032, "step": 65100 }, { "epoch": 9.69690199582961, "grad_norm": 0.2655397653579712, "learning_rate": 1.399303938990626e-07, "loss": 0.793, "num_input_tokens_seen": 37556232, "step": 65105 }, { "epoch": 9.697646708370568, "grad_norm": 0.18788984417915344, "learning_rate": 1.392445948270843e-07, "loss": 0.7886, "num_input_tokens_seen": 37559144, "step": 65110 }, { "epoch": 9.698391420911529, "grad_norm": 0.17648863792419434, "learning_rate": 1.385604757438791e-07, "loss": 0.801, "num_input_tokens_seen": 37561992, "step": 65115 }, { "epoch": 9.699136133452487, "grad_norm": 0.18984578549861908, "learning_rate": 1.3787803669567667e-07, "loss": 0.8057, "num_input_tokens_seen": 37564616, "step": 65120 }, { "epoch": 9.699880845993446, "grad_norm": 0.28790318965911865, "learning_rate": 1.371972777285957e-07, "loss": 0.7989, "num_input_tokens_seen": 37567816, "step": 65125 }, { "epoch": 9.700625558534405, "grad_norm": 0.18273143470287323, "learning_rate": 1.3651819888863548e-07, "loss": 0.7715, "num_input_tokens_seen": 37570696, "step": 65130 }, { "epoch": 9.701370271075366, "grad_norm": 0.3081246316432953, "learning_rate": 1.3584080022169266e-07, "loss": 0.8219, "num_input_tokens_seen": 37574024, "step": 65135 }, { "epoch": 9.702114983616324, "grad_norm": 0.21368420124053955, "learning_rate": 1.3516508177353337e-07, "loss": 0.8051, "num_input_tokens_seen": 37576904, "step": 65140 }, { "epoch": 9.702859696157283, "grad_norm": 0.272771954536438, "learning_rate": 1.3449104358982944e-07, "loss": 0.7951, "num_input_tokens_seen": 37579720, "step": 65145 }, { "epoch": 9.703604408698242, "grad_norm": 0.25195592641830444, "learning_rate": 1.3381868571612222e-07, "loss": 0.8323, "num_input_tokens_seen": 37582280, "step": 65150 }, { "epoch": 9.704349121239202, "grad_norm": 0.1973402202129364, "learning_rate": 1.3314800819785035e-07, "loss": 0.8411, "num_input_tokens_seen": 37584808, "step": 65155 }, { "epoch": 9.705093833780161, "grad_norm": 0.20922242105007172, "learning_rate": 1.3247901108033313e-07, "loss": 0.7731, "num_input_tokens_seen": 37587848, "step": 65160 }, { "epoch": 9.70583854632112, "grad_norm": 0.19135043025016785, "learning_rate": 1.318116944087816e-07, "loss": 0.8158, "num_input_tokens_seen": 37590888, "step": 65165 }, { "epoch": 9.706583258862079, "grad_norm": 0.18573909997940063, "learning_rate": 1.3114605822829028e-07, "loss": 0.7794, "num_input_tokens_seen": 37593416, "step": 65170 }, { "epoch": 9.70732797140304, "grad_norm": 0.2972489297389984, "learning_rate": 1.304821025838371e-07, "loss": 0.8177, "num_input_tokens_seen": 37596552, "step": 65175 }, { "epoch": 9.708072683943998, "grad_norm": 0.19822697341442108, "learning_rate": 1.2981982752029164e-07, "loss": 0.7929, "num_input_tokens_seen": 37599464, "step": 65180 }, { "epoch": 9.708817396484957, "grad_norm": 0.25039827823638916, "learning_rate": 1.2915923308240984e-07, "loss": 0.7892, "num_input_tokens_seen": 37602248, "step": 65185 }, { "epoch": 9.709562109025915, "grad_norm": 0.24291303753852844, "learning_rate": 1.2850031931482543e-07, "loss": 0.8111, "num_input_tokens_seen": 37605288, "step": 65190 }, { "epoch": 9.710306821566876, "grad_norm": 0.26125404238700867, "learning_rate": 1.278430862620722e-07, "loss": 0.7997, "num_input_tokens_seen": 37607976, "step": 65195 }, { "epoch": 9.711051534107835, "grad_norm": 0.20894932746887207, "learning_rate": 1.2718753396855908e-07, "loss": 0.7853, "num_input_tokens_seen": 37610888, "step": 65200 }, { "epoch": 9.711796246648793, "grad_norm": 0.29695072770118713, "learning_rate": 1.2653366247858955e-07, "loss": 0.7994, "num_input_tokens_seen": 37613704, "step": 65205 }, { "epoch": 9.712540959189752, "grad_norm": 0.15289394557476044, "learning_rate": 1.258814718363449e-07, "loss": 0.805, "num_input_tokens_seen": 37616520, "step": 65210 }, { "epoch": 9.713285671730713, "grad_norm": 0.2281665951013565, "learning_rate": 1.2523096208589823e-07, "loss": 0.7877, "num_input_tokens_seen": 37619432, "step": 65215 }, { "epoch": 9.714030384271672, "grad_norm": 0.2718381881713867, "learning_rate": 1.245821332712116e-07, "loss": 0.7757, "num_input_tokens_seen": 37622344, "step": 65220 }, { "epoch": 9.71477509681263, "grad_norm": 0.24238263070583344, "learning_rate": 1.2393498543612769e-07, "loss": 0.8336, "num_input_tokens_seen": 37625000, "step": 65225 }, { "epoch": 9.715519809353589, "grad_norm": 0.20108267664909363, "learning_rate": 1.232895186243782e-07, "loss": 0.7865, "num_input_tokens_seen": 37627944, "step": 65230 }, { "epoch": 9.716264521894548, "grad_norm": 0.261739045381546, "learning_rate": 1.2264573287958382e-07, "loss": 0.7841, "num_input_tokens_seen": 37630696, "step": 65235 }, { "epoch": 9.717009234435508, "grad_norm": 0.19627319276332855, "learning_rate": 1.220036282452458e-07, "loss": 0.7863, "num_input_tokens_seen": 37633608, "step": 65240 }, { "epoch": 9.717753946976467, "grad_norm": 0.2794288098812103, "learning_rate": 1.213632047647545e-07, "loss": 0.7945, "num_input_tokens_seen": 37636488, "step": 65245 }, { "epoch": 9.718498659517426, "grad_norm": 0.2366807907819748, "learning_rate": 1.2072446248138912e-07, "loss": 0.7662, "num_input_tokens_seen": 37639560, "step": 65250 }, { "epoch": 9.719243372058386, "grad_norm": 0.1917020082473755, "learning_rate": 1.200874014383152e-07, "loss": 0.8034, "num_input_tokens_seen": 37642568, "step": 65255 }, { "epoch": 9.719988084599345, "grad_norm": 0.3180859088897705, "learning_rate": 1.1945202167857882e-07, "loss": 0.8177, "num_input_tokens_seen": 37645576, "step": 65260 }, { "epoch": 9.720732797140304, "grad_norm": 0.23789675533771515, "learning_rate": 1.188183232451151e-07, "loss": 0.7959, "num_input_tokens_seen": 37648488, "step": 65265 }, { "epoch": 9.721477509681263, "grad_norm": 0.20781603455543518, "learning_rate": 1.1818630618075366e-07, "loss": 0.8439, "num_input_tokens_seen": 37651464, "step": 65270 }, { "epoch": 9.722222222222221, "grad_norm": 0.1961425244808197, "learning_rate": 1.1755597052819922e-07, "loss": 0.8119, "num_input_tokens_seen": 37654280, "step": 65275 }, { "epoch": 9.722966934763182, "grad_norm": 0.22353948652744293, "learning_rate": 1.169273163300455e-07, "loss": 0.7763, "num_input_tokens_seen": 37656840, "step": 65280 }, { "epoch": 9.72371164730414, "grad_norm": 0.22430862486362457, "learning_rate": 1.1630034362877796e-07, "loss": 0.8063, "num_input_tokens_seen": 37659752, "step": 65285 }, { "epoch": 9.7244563598451, "grad_norm": 0.24308301508426666, "learning_rate": 1.1567505246676269e-07, "loss": 0.8278, "num_input_tokens_seen": 37662888, "step": 65290 }, { "epoch": 9.725201072386058, "grad_norm": 0.32333704829216003, "learning_rate": 1.150514428862548e-07, "loss": 0.8207, "num_input_tokens_seen": 37665896, "step": 65295 }, { "epoch": 9.725945784927019, "grad_norm": 0.19358162581920624, "learning_rate": 1.1442951492939835e-07, "loss": 0.8129, "num_input_tokens_seen": 37668904, "step": 65300 }, { "epoch": 9.726690497467978, "grad_norm": 0.2854115664958954, "learning_rate": 1.1380926863821528e-07, "loss": 0.8122, "num_input_tokens_seen": 37671592, "step": 65305 }, { "epoch": 9.727435210008936, "grad_norm": 0.22195002436637878, "learning_rate": 1.1319070405462207e-07, "loss": 0.7957, "num_input_tokens_seen": 37674824, "step": 65310 }, { "epoch": 9.728179922549895, "grad_norm": 0.4553593099117279, "learning_rate": 1.1257382122041859e-07, "loss": 0.8306, "num_input_tokens_seen": 37677640, "step": 65315 }, { "epoch": 9.728924635090856, "grad_norm": 0.20573315024375916, "learning_rate": 1.1195862017729097e-07, "loss": 0.7958, "num_input_tokens_seen": 37680520, "step": 65320 }, { "epoch": 9.729669347631814, "grad_norm": 0.22317898273468018, "learning_rate": 1.1134510096681427e-07, "loss": 0.7961, "num_input_tokens_seen": 37683368, "step": 65325 }, { "epoch": 9.730414060172773, "grad_norm": 0.21694129705429077, "learning_rate": 1.1073326363044423e-07, "loss": 0.8096, "num_input_tokens_seen": 37686056, "step": 65330 }, { "epoch": 9.731158772713732, "grad_norm": 0.224278524518013, "learning_rate": 1.1012310820952831e-07, "loss": 0.7907, "num_input_tokens_seen": 37688936, "step": 65335 }, { "epoch": 9.731903485254692, "grad_norm": 0.2714307904243469, "learning_rate": 1.0951463474529744e-07, "loss": 0.8174, "num_input_tokens_seen": 37691720, "step": 65340 }, { "epoch": 9.732648197795651, "grad_norm": 0.16542594134807587, "learning_rate": 1.0890784327887149e-07, "loss": 0.8069, "num_input_tokens_seen": 37694376, "step": 65345 }, { "epoch": 9.73339291033661, "grad_norm": 0.204095259308815, "learning_rate": 1.0830273385125378e-07, "loss": 0.8003, "num_input_tokens_seen": 37697480, "step": 65350 }, { "epoch": 9.734137622877569, "grad_norm": 0.20993474125862122, "learning_rate": 1.0769930650333382e-07, "loss": 0.7879, "num_input_tokens_seen": 37700360, "step": 65355 }, { "epoch": 9.73488233541853, "grad_norm": 0.16245482861995697, "learning_rate": 1.070975612758901e-07, "loss": 0.8006, "num_input_tokens_seen": 37702888, "step": 65360 }, { "epoch": 9.735627047959488, "grad_norm": 0.23715956509113312, "learning_rate": 1.0649749820958732e-07, "loss": 0.8185, "num_input_tokens_seen": 37705512, "step": 65365 }, { "epoch": 9.736371760500447, "grad_norm": 0.22259122133255005, "learning_rate": 1.058991173449736e-07, "loss": 0.7736, "num_input_tokens_seen": 37708136, "step": 65370 }, { "epoch": 9.737116473041405, "grad_norm": 0.2182357907295227, "learning_rate": 1.0530241872248326e-07, "loss": 0.7935, "num_input_tokens_seen": 37710984, "step": 65375 }, { "epoch": 9.737861185582366, "grad_norm": 0.19611337780952454, "learning_rate": 1.0470740238244237e-07, "loss": 0.7876, "num_input_tokens_seen": 37714024, "step": 65380 }, { "epoch": 9.738605898123325, "grad_norm": 0.25822412967681885, "learning_rate": 1.0411406836505766e-07, "loss": 0.8157, "num_input_tokens_seen": 37716840, "step": 65385 }, { "epoch": 9.739350610664284, "grad_norm": 0.15574084222316742, "learning_rate": 1.0352241671042762e-07, "loss": 0.8026, "num_input_tokens_seen": 37719688, "step": 65390 }, { "epoch": 9.740095323205242, "grad_norm": 0.1997651904821396, "learning_rate": 1.0293244745852859e-07, "loss": 0.7921, "num_input_tokens_seen": 37722248, "step": 65395 }, { "epoch": 9.740840035746203, "grad_norm": 0.18294347822666168, "learning_rate": 1.0234416064923146e-07, "loss": 0.7914, "num_input_tokens_seen": 37724840, "step": 65400 }, { "epoch": 9.741584748287162, "grad_norm": 0.24892006814479828, "learning_rate": 1.0175755632228779e-07, "loss": 0.789, "num_input_tokens_seen": 37727528, "step": 65405 }, { "epoch": 9.74232946082812, "grad_norm": 0.21255852282047272, "learning_rate": 1.0117263451734083e-07, "loss": 0.8095, "num_input_tokens_seen": 37730440, "step": 65410 }, { "epoch": 9.743074173369079, "grad_norm": 0.25532764196395874, "learning_rate": 1.005893952739173e-07, "loss": 0.8028, "num_input_tokens_seen": 37733320, "step": 65415 }, { "epoch": 9.743818885910038, "grad_norm": 0.2778966724872589, "learning_rate": 1.0000783863142738e-07, "loss": 0.7718, "num_input_tokens_seen": 37736456, "step": 65420 }, { "epoch": 9.744563598450998, "grad_norm": 0.22863803803920746, "learning_rate": 9.942796462917014e-08, "loss": 0.7882, "num_input_tokens_seen": 37739304, "step": 65425 }, { "epoch": 9.745308310991957, "grad_norm": 0.22751456499099731, "learning_rate": 9.884977330633649e-08, "loss": 0.8187, "num_input_tokens_seen": 37742024, "step": 65430 }, { "epoch": 9.746053023532916, "grad_norm": 0.1536756306886673, "learning_rate": 9.827326470199239e-08, "loss": 0.7761, "num_input_tokens_seen": 37745000, "step": 65435 }, { "epoch": 9.746797736073875, "grad_norm": 0.3332926034927368, "learning_rate": 9.769843885509834e-08, "loss": 0.7878, "num_input_tokens_seen": 37747848, "step": 65440 }, { "epoch": 9.747542448614835, "grad_norm": 0.20134063065052032, "learning_rate": 9.712529580449825e-08, "loss": 0.7971, "num_input_tokens_seen": 37751016, "step": 65445 }, { "epoch": 9.748287161155794, "grad_norm": 0.2816855311393738, "learning_rate": 9.655383558892228e-08, "loss": 0.79, "num_input_tokens_seen": 37753896, "step": 65450 }, { "epoch": 9.749031873696753, "grad_norm": 0.2160962074995041, "learning_rate": 9.598405824698953e-08, "loss": 0.7785, "num_input_tokens_seen": 37756968, "step": 65455 }, { "epoch": 9.749776586237711, "grad_norm": 0.2371128797531128, "learning_rate": 9.541596381719976e-08, "loss": 0.7949, "num_input_tokens_seen": 37759880, "step": 65460 }, { "epoch": 9.750521298778672, "grad_norm": 0.25281822681427, "learning_rate": 9.484955233794723e-08, "loss": 0.7861, "num_input_tokens_seen": 37762888, "step": 65465 }, { "epoch": 9.75126601131963, "grad_norm": 0.22458526492118835, "learning_rate": 9.428482384750136e-08, "loss": 0.7959, "num_input_tokens_seen": 37765544, "step": 65470 }, { "epoch": 9.75201072386059, "grad_norm": 0.2037806510925293, "learning_rate": 9.372177838403162e-08, "loss": 0.7891, "num_input_tokens_seen": 37768520, "step": 65475 }, { "epoch": 9.752755436401548, "grad_norm": 0.22903984785079956, "learning_rate": 9.316041598557979e-08, "loss": 0.8049, "num_input_tokens_seen": 37771144, "step": 65480 }, { "epoch": 9.753500148942509, "grad_norm": 0.20575150847434998, "learning_rate": 9.260073669008496e-08, "loss": 0.7891, "num_input_tokens_seen": 37773928, "step": 65485 }, { "epoch": 9.754244861483468, "grad_norm": 0.25055715441703796, "learning_rate": 9.20427405353641e-08, "loss": 0.793, "num_input_tokens_seen": 37776840, "step": 65490 }, { "epoch": 9.754989574024426, "grad_norm": 0.20220187306404114, "learning_rate": 9.148642755912873e-08, "loss": 0.8199, "num_input_tokens_seen": 37779784, "step": 65495 }, { "epoch": 9.755734286565385, "grad_norm": 0.25774505734443665, "learning_rate": 9.093179779897099e-08, "loss": 0.802, "num_input_tokens_seen": 37782600, "step": 65500 }, { "epoch": 9.756478999106346, "grad_norm": 0.27478182315826416, "learning_rate": 9.037885129236645e-08, "loss": 0.796, "num_input_tokens_seen": 37785480, "step": 65505 }, { "epoch": 9.757223711647304, "grad_norm": 0.21926221251487732, "learning_rate": 8.982758807668523e-08, "loss": 0.7934, "num_input_tokens_seen": 37788456, "step": 65510 }, { "epoch": 9.757968424188263, "grad_norm": 0.21464794874191284, "learning_rate": 8.927800818917809e-08, "loss": 0.7851, "num_input_tokens_seen": 37791336, "step": 65515 }, { "epoch": 9.758713136729222, "grad_norm": 0.23271963000297546, "learning_rate": 8.873011166698475e-08, "loss": 0.7786, "num_input_tokens_seen": 37794344, "step": 65520 }, { "epoch": 9.759457849270182, "grad_norm": 0.2890450358390808, "learning_rate": 8.818389854712561e-08, "loss": 0.7795, "num_input_tokens_seen": 37797608, "step": 65525 }, { "epoch": 9.760202561811141, "grad_norm": 0.2358945906162262, "learning_rate": 8.763936886651558e-08, "loss": 0.8306, "num_input_tokens_seen": 37800296, "step": 65530 }, { "epoch": 9.7609472743521, "grad_norm": 0.3001296818256378, "learning_rate": 8.709652266195301e-08, "loss": 0.7769, "num_input_tokens_seen": 37803464, "step": 65535 }, { "epoch": 9.761691986893059, "grad_norm": 0.2254984825849533, "learning_rate": 8.655535997011688e-08, "loss": 0.808, "num_input_tokens_seen": 37806632, "step": 65540 }, { "epoch": 9.76243669943402, "grad_norm": 0.20029114186763763, "learning_rate": 8.601588082758073e-08, "loss": 0.8132, "num_input_tokens_seen": 37809384, "step": 65545 }, { "epoch": 9.763181411974978, "grad_norm": 0.3287881016731262, "learning_rate": 8.547808527079593e-08, "loss": 0.7898, "num_input_tokens_seen": 37812168, "step": 65550 }, { "epoch": 9.763926124515937, "grad_norm": 0.1828824132680893, "learning_rate": 8.494197333610843e-08, "loss": 0.7696, "num_input_tokens_seen": 37814920, "step": 65555 }, { "epoch": 9.764670837056896, "grad_norm": 0.21048098802566528, "learning_rate": 8.44075450597448e-08, "loss": 0.8139, "num_input_tokens_seen": 37817480, "step": 65560 }, { "epoch": 9.765415549597854, "grad_norm": 0.2236815243959427, "learning_rate": 8.38748004778206e-08, "loss": 0.8155, "num_input_tokens_seen": 37820296, "step": 65565 }, { "epoch": 9.766160262138815, "grad_norm": 0.168228879570961, "learning_rate": 8.33437396263348e-08, "loss": 0.7944, "num_input_tokens_seen": 37823144, "step": 65570 }, { "epoch": 9.766904974679774, "grad_norm": 0.28974640369415283, "learning_rate": 8.281436254117536e-08, "loss": 0.8056, "num_input_tokens_seen": 37826248, "step": 65575 }, { "epoch": 9.767649687220732, "grad_norm": 0.20621611177921295, "learning_rate": 8.228666925811646e-08, "loss": 0.8237, "num_input_tokens_seen": 37829128, "step": 65580 }, { "epoch": 9.768394399761693, "grad_norm": 0.17183442413806915, "learning_rate": 8.176065981281567e-08, "loss": 0.8153, "num_input_tokens_seen": 37831720, "step": 65585 }, { "epoch": 9.769139112302652, "grad_norm": 0.2132394164800644, "learning_rate": 8.123633424081956e-08, "loss": 0.8387, "num_input_tokens_seen": 37834664, "step": 65590 }, { "epoch": 9.76988382484361, "grad_norm": 0.35168883204460144, "learning_rate": 8.07136925775609e-08, "loss": 0.7925, "num_input_tokens_seen": 37837832, "step": 65595 }, { "epoch": 9.77062853738457, "grad_norm": 0.26165771484375, "learning_rate": 8.01927348583531e-08, "loss": 0.7942, "num_input_tokens_seen": 37841192, "step": 65600 }, { "epoch": 9.771373249925528, "grad_norm": 0.1691902130842209, "learning_rate": 7.96734611184069e-08, "loss": 0.8156, "num_input_tokens_seen": 37843880, "step": 65605 }, { "epoch": 9.772117962466488, "grad_norm": 0.1928829848766327, "learning_rate": 7.915587139280811e-08, "loss": 0.8141, "num_input_tokens_seen": 37846760, "step": 65610 }, { "epoch": 9.772862675007447, "grad_norm": 0.23101677000522614, "learning_rate": 7.863996571653431e-08, "loss": 0.8268, "num_input_tokens_seen": 37849320, "step": 65615 }, { "epoch": 9.773607387548406, "grad_norm": 0.19549202919006348, "learning_rate": 7.812574412444929e-08, "loss": 0.8073, "num_input_tokens_seen": 37851816, "step": 65620 }, { "epoch": 9.774352100089365, "grad_norm": 0.26661524176597595, "learning_rate": 7.761320665130301e-08, "loss": 0.7923, "num_input_tokens_seen": 37854600, "step": 65625 }, { "epoch": 9.775096812630325, "grad_norm": 0.23287807404994965, "learning_rate": 7.71023533317261e-08, "loss": 0.7872, "num_input_tokens_seen": 37857448, "step": 65630 }, { "epoch": 9.775841525171284, "grad_norm": 0.2168508917093277, "learning_rate": 7.659318420024653e-08, "loss": 0.792, "num_input_tokens_seen": 37860392, "step": 65635 }, { "epoch": 9.776586237712243, "grad_norm": 0.23228763043880463, "learning_rate": 7.60856992912673e-08, "loss": 0.8009, "num_input_tokens_seen": 37863752, "step": 65640 }, { "epoch": 9.777330950253202, "grad_norm": 0.24248337745666504, "learning_rate": 7.557989863908044e-08, "loss": 0.7929, "num_input_tokens_seen": 37866504, "step": 65645 }, { "epoch": 9.778075662794162, "grad_norm": 0.19300344586372375, "learning_rate": 7.507578227787249e-08, "loss": 0.8129, "num_input_tokens_seen": 37869640, "step": 65650 }, { "epoch": 9.77882037533512, "grad_norm": 0.2135007530450821, "learning_rate": 7.457335024170231e-08, "loss": 0.7755, "num_input_tokens_seen": 37872584, "step": 65655 }, { "epoch": 9.77956508787608, "grad_norm": 0.21617300808429718, "learning_rate": 7.407260256452885e-08, "loss": 0.8016, "num_input_tokens_seen": 37875592, "step": 65660 }, { "epoch": 9.780309800417038, "grad_norm": 0.20447608828544617, "learning_rate": 7.357353928018618e-08, "loss": 0.8111, "num_input_tokens_seen": 37878536, "step": 65665 }, { "epoch": 9.781054512957999, "grad_norm": 0.2525056302547455, "learning_rate": 7.307616042240007e-08, "loss": 0.7863, "num_input_tokens_seen": 37881288, "step": 65670 }, { "epoch": 9.781799225498958, "grad_norm": 0.22708852589130402, "learning_rate": 7.258046602478252e-08, "loss": 0.8055, "num_input_tokens_seen": 37884008, "step": 65675 }, { "epoch": 9.782543938039916, "grad_norm": 0.19082430005073547, "learning_rate": 7.208645612082899e-08, "loss": 0.7806, "num_input_tokens_seen": 37886888, "step": 65680 }, { "epoch": 9.783288650580875, "grad_norm": 0.2499960958957672, "learning_rate": 7.159413074392107e-08, "loss": 0.8006, "num_input_tokens_seen": 37889896, "step": 65685 }, { "epoch": 9.784033363121836, "grad_norm": 0.22963105142116547, "learning_rate": 7.110348992733217e-08, "loss": 0.8023, "num_input_tokens_seen": 37892840, "step": 65690 }, { "epoch": 9.784778075662794, "grad_norm": 0.1932002604007721, "learning_rate": 7.061453370421634e-08, "loss": 0.8336, "num_input_tokens_seen": 37895560, "step": 65695 }, { "epoch": 9.785522788203753, "grad_norm": 0.19583097100257874, "learning_rate": 7.012726210761656e-08, "loss": 0.8048, "num_input_tokens_seen": 37898120, "step": 65700 }, { "epoch": 9.786267500744712, "grad_norm": 0.1932033747434616, "learning_rate": 6.96416751704565e-08, "loss": 0.794, "num_input_tokens_seen": 37900968, "step": 65705 }, { "epoch": 9.787012213285673, "grad_norm": 0.283830463886261, "learning_rate": 6.915777292555159e-08, "loss": 0.8033, "num_input_tokens_seen": 37903912, "step": 65710 }, { "epoch": 9.787756925826631, "grad_norm": 0.28243330121040344, "learning_rate": 6.867555540560621e-08, "loss": 0.777, "num_input_tokens_seen": 37906920, "step": 65715 }, { "epoch": 9.78850163836759, "grad_norm": 0.2473343312740326, "learning_rate": 6.819502264319988e-08, "loss": 0.7989, "num_input_tokens_seen": 37909768, "step": 65720 }, { "epoch": 9.789246350908549, "grad_norm": 0.1785673052072525, "learning_rate": 6.771617467080938e-08, "loss": 0.7972, "num_input_tokens_seen": 37912648, "step": 65725 }, { "epoch": 9.78999106344951, "grad_norm": 0.17820772528648376, "learning_rate": 6.723901152079492e-08, "loss": 0.7803, "num_input_tokens_seen": 37915240, "step": 65730 }, { "epoch": 9.790735775990468, "grad_norm": 0.20170117914676666, "learning_rate": 6.676353322539741e-08, "loss": 0.7876, "num_input_tokens_seen": 37917928, "step": 65735 }, { "epoch": 9.791480488531427, "grad_norm": 0.18788158893585205, "learning_rate": 6.628973981674947e-08, "loss": 0.7825, "num_input_tokens_seen": 37921704, "step": 65740 }, { "epoch": 9.792225201072386, "grad_norm": 0.27557486295700073, "learning_rate": 6.581763132686714e-08, "loss": 0.7918, "num_input_tokens_seen": 37924424, "step": 65745 }, { "epoch": 9.792969913613344, "grad_norm": 0.22961874306201935, "learning_rate": 6.534720778765547e-08, "loss": 0.7877, "num_input_tokens_seen": 37927144, "step": 65750 }, { "epoch": 9.793714626154305, "grad_norm": 0.21713273227214813, "learning_rate": 6.487846923090012e-08, "loss": 0.7769, "num_input_tokens_seen": 37929800, "step": 65755 }, { "epoch": 9.794459338695264, "grad_norm": 0.22052940726280212, "learning_rate": 6.441141568828135e-08, "loss": 0.7722, "num_input_tokens_seen": 37932616, "step": 65760 }, { "epoch": 9.795204051236222, "grad_norm": 0.21484440565109253, "learning_rate": 6.394604719135722e-08, "loss": 0.7733, "num_input_tokens_seen": 37935432, "step": 65765 }, { "epoch": 9.795948763777183, "grad_norm": 0.2813796401023865, "learning_rate": 6.348236377157756e-08, "loss": 0.8101, "num_input_tokens_seen": 37938440, "step": 65770 }, { "epoch": 9.796693476318142, "grad_norm": 0.2733401358127594, "learning_rate": 6.30203654602729e-08, "loss": 0.7951, "num_input_tokens_seen": 37941384, "step": 65775 }, { "epoch": 9.7974381888591, "grad_norm": 0.2515677213668823, "learning_rate": 6.256005228866824e-08, "loss": 0.8113, "num_input_tokens_seen": 37944296, "step": 65780 }, { "epoch": 9.79818290140006, "grad_norm": 0.21083760261535645, "learning_rate": 6.210142428786647e-08, "loss": 0.7888, "num_input_tokens_seen": 37947240, "step": 65785 }, { "epoch": 9.798927613941018, "grad_norm": 0.2123948186635971, "learning_rate": 6.164448148885948e-08, "loss": 0.7858, "num_input_tokens_seen": 37950376, "step": 65790 }, { "epoch": 9.799672326481979, "grad_norm": 0.19786716997623444, "learning_rate": 6.118922392252813e-08, "loss": 0.7962, "num_input_tokens_seen": 37953416, "step": 65795 }, { "epoch": 9.800417039022937, "grad_norm": 0.2336268126964569, "learning_rate": 6.07356516196339e-08, "loss": 0.8244, "num_input_tokens_seen": 37956520, "step": 65800 }, { "epoch": 9.801161751563896, "grad_norm": 0.26393499970436096, "learning_rate": 6.028376461082729e-08, "loss": 0.792, "num_input_tokens_seen": 37959272, "step": 65805 }, { "epoch": 9.801906464104855, "grad_norm": 0.24486258625984192, "learning_rate": 5.983356292664776e-08, "loss": 0.8147, "num_input_tokens_seen": 37962088, "step": 65810 }, { "epoch": 9.802651176645815, "grad_norm": 0.1946437507867813, "learning_rate": 5.9385046597518204e-08, "loss": 0.8166, "num_input_tokens_seen": 37964680, "step": 65815 }, { "epoch": 9.803395889186774, "grad_norm": 0.1751215159893036, "learning_rate": 5.893821565374491e-08, "loss": 0.8068, "num_input_tokens_seen": 37967496, "step": 65820 }, { "epoch": 9.804140601727733, "grad_norm": 0.2134670913219452, "learning_rate": 5.8493070125523184e-08, "loss": 0.8077, "num_input_tokens_seen": 37970504, "step": 65825 }, { "epoch": 9.804885314268692, "grad_norm": 0.2609241306781769, "learning_rate": 5.804961004293452e-08, "loss": 0.8066, "num_input_tokens_seen": 37973288, "step": 65830 }, { "epoch": 9.805630026809652, "grad_norm": 0.2399100363254547, "learning_rate": 5.760783543594939e-08, "loss": 0.8043, "num_input_tokens_seen": 37976328, "step": 65835 }, { "epoch": 9.80637473935061, "grad_norm": 0.22559167444705963, "learning_rate": 5.716774633441613e-08, "loss": 0.7758, "num_input_tokens_seen": 37979304, "step": 65840 }, { "epoch": 9.80711945189157, "grad_norm": 0.1984913945198059, "learning_rate": 5.672934276807762e-08, "loss": 0.8295, "num_input_tokens_seen": 37982312, "step": 65845 }, { "epoch": 9.807864164432528, "grad_norm": 0.22260889410972595, "learning_rate": 5.629262476655739e-08, "loss": 0.7544, "num_input_tokens_seen": 37985768, "step": 65850 }, { "epoch": 9.808608876973489, "grad_norm": 0.2903110682964325, "learning_rate": 5.585759235936794e-08, "loss": 0.8106, "num_input_tokens_seen": 37988680, "step": 65855 }, { "epoch": 9.809353589514448, "grad_norm": 0.176306813955307, "learning_rate": 5.54242455759052e-08, "loss": 0.8295, "num_input_tokens_seen": 37991784, "step": 65860 }, { "epoch": 9.810098302055406, "grad_norm": 0.27359795570373535, "learning_rate": 5.499258444545685e-08, "loss": 0.8124, "num_input_tokens_seen": 37994472, "step": 65865 }, { "epoch": 9.810843014596365, "grad_norm": 0.23535698652267456, "learning_rate": 5.4562608997191234e-08, "loss": 0.8081, "num_input_tokens_seen": 37997320, "step": 65870 }, { "epoch": 9.811587727137326, "grad_norm": 0.21096505224704742, "learning_rate": 5.413431926016288e-08, "loss": 0.7978, "num_input_tokens_seen": 38000104, "step": 65875 }, { "epoch": 9.812332439678285, "grad_norm": 0.21234558522701263, "learning_rate": 5.3707715263315305e-08, "loss": 0.8041, "num_input_tokens_seen": 38002856, "step": 65880 }, { "epoch": 9.813077152219243, "grad_norm": 0.26220670342445374, "learning_rate": 5.328279703547545e-08, "loss": 0.8291, "num_input_tokens_seen": 38005928, "step": 65885 }, { "epoch": 9.813821864760202, "grad_norm": 0.20726843178272247, "learning_rate": 5.285956460535646e-08, "loss": 0.7729, "num_input_tokens_seen": 38008744, "step": 65890 }, { "epoch": 9.814566577301163, "grad_norm": 0.19467590749263763, "learning_rate": 5.243801800156323e-08, "loss": 0.7713, "num_input_tokens_seen": 38011624, "step": 65895 }, { "epoch": 9.815311289842121, "grad_norm": 0.21457652747631073, "learning_rate": 5.2018157252578525e-08, "loss": 0.7788, "num_input_tokens_seen": 38014408, "step": 65900 }, { "epoch": 9.81605600238308, "grad_norm": 0.15321782231330872, "learning_rate": 5.15999823867741e-08, "loss": 0.8077, "num_input_tokens_seen": 38017256, "step": 65905 }, { "epoch": 9.816800714924039, "grad_norm": 0.2419409304857254, "learning_rate": 5.118349343241069e-08, "loss": 0.7977, "num_input_tokens_seen": 38020200, "step": 65910 }, { "epoch": 9.817545427465, "grad_norm": 0.2573789060115814, "learning_rate": 5.076869041763521e-08, "loss": 0.8318, "num_input_tokens_seen": 38023400, "step": 65915 }, { "epoch": 9.818290140005958, "grad_norm": 0.19824016094207764, "learning_rate": 5.0355573370472475e-08, "loss": 0.7965, "num_input_tokens_seen": 38026312, "step": 65920 }, { "epoch": 9.819034852546917, "grad_norm": 0.3341398239135742, "learning_rate": 4.9944142318841816e-08, "loss": 0.8129, "num_input_tokens_seen": 38029448, "step": 65925 }, { "epoch": 9.819779565087876, "grad_norm": 0.29487645626068115, "learning_rate": 4.953439729054876e-08, "loss": 0.8131, "num_input_tokens_seen": 38032200, "step": 65930 }, { "epoch": 9.820524277628834, "grad_norm": 0.23268146812915802, "learning_rate": 4.9126338313279504e-08, "loss": 0.7874, "num_input_tokens_seen": 38035208, "step": 65935 }, { "epoch": 9.821268990169795, "grad_norm": 0.22793522477149963, "learning_rate": 4.8719965414606436e-08, "loss": 0.8402, "num_input_tokens_seen": 38038088, "step": 65940 }, { "epoch": 9.822013702710754, "grad_norm": 0.21563008427619934, "learning_rate": 4.831527862199647e-08, "loss": 0.8159, "num_input_tokens_seen": 38040616, "step": 65945 }, { "epoch": 9.822758415251712, "grad_norm": 0.21870248019695282, "learning_rate": 4.791227796279163e-08, "loss": 0.7938, "num_input_tokens_seen": 38043464, "step": 65950 }, { "epoch": 9.823503127792671, "grad_norm": 0.19999970495700836, "learning_rate": 4.751096346423123e-08, "loss": 0.7822, "num_input_tokens_seen": 38046376, "step": 65955 }, { "epoch": 9.824247840333632, "grad_norm": 0.2249356508255005, "learning_rate": 4.711133515342692e-08, "loss": 0.808, "num_input_tokens_seen": 38049320, "step": 65960 }, { "epoch": 9.82499255287459, "grad_norm": 0.16803768277168274, "learning_rate": 4.6713393057387646e-08, "loss": 0.7965, "num_input_tokens_seen": 38052072, "step": 65965 }, { "epoch": 9.82573726541555, "grad_norm": 0.2255750447511673, "learning_rate": 4.631713720300856e-08, "loss": 0.7969, "num_input_tokens_seen": 38054952, "step": 65970 }, { "epoch": 9.826481977956508, "grad_norm": 0.17278937995433807, "learning_rate": 4.592256761705993e-08, "loss": 0.8009, "num_input_tokens_seen": 38057832, "step": 65975 }, { "epoch": 9.827226690497469, "grad_norm": 0.22374872863292694, "learning_rate": 4.5529684326206526e-08, "loss": 0.7909, "num_input_tokens_seen": 38060744, "step": 65980 }, { "epoch": 9.827971403038427, "grad_norm": 0.19098158180713654, "learning_rate": 4.51384873570021e-08, "loss": 0.7981, "num_input_tokens_seen": 38063432, "step": 65985 }, { "epoch": 9.828716115579386, "grad_norm": 0.15503621101379395, "learning_rate": 4.47489767358783e-08, "loss": 0.7909, "num_input_tokens_seen": 38066120, "step": 65990 }, { "epoch": 9.829460828120345, "grad_norm": 0.19022050499916077, "learning_rate": 4.436115248915851e-08, "loss": 0.7888, "num_input_tokens_seen": 38068776, "step": 65995 }, { "epoch": 9.830205540661305, "grad_norm": 0.23947761952877045, "learning_rate": 4.397501464304954e-08, "loss": 0.7852, "num_input_tokens_seen": 38071816, "step": 66000 }, { "epoch": 9.830950253202264, "grad_norm": 0.226179838180542, "learning_rate": 4.3590563223647184e-08, "loss": 0.7803, "num_input_tokens_seen": 38074536, "step": 66005 }, { "epoch": 9.831694965743223, "grad_norm": 0.24866098165512085, "learning_rate": 4.320779825692789e-08, "loss": 0.7826, "num_input_tokens_seen": 38077224, "step": 66010 }, { "epoch": 9.832439678284182, "grad_norm": 0.18496482074260712, "learning_rate": 4.2826719768757074e-08, "loss": 0.7772, "num_input_tokens_seen": 38080168, "step": 66015 }, { "epoch": 9.833184390825142, "grad_norm": 0.24947795271873474, "learning_rate": 4.244732778489191e-08, "loss": 0.7957, "num_input_tokens_seen": 38083176, "step": 66020 }, { "epoch": 9.833929103366101, "grad_norm": 0.26025819778442383, "learning_rate": 4.20696223309619e-08, "loss": 0.7827, "num_input_tokens_seen": 38086120, "step": 66025 }, { "epoch": 9.83467381590706, "grad_norm": 0.35198789834976196, "learning_rate": 4.1693603432499396e-08, "loss": 0.7893, "num_input_tokens_seen": 38089128, "step": 66030 }, { "epoch": 9.835418528448018, "grad_norm": 0.18298056721687317, "learning_rate": 4.13192711149063e-08, "loss": 0.783, "num_input_tokens_seen": 38092040, "step": 66035 }, { "epoch": 9.836163240988979, "grad_norm": 0.1904418021440506, "learning_rate": 4.0946625403484593e-08, "loss": 0.7901, "num_input_tokens_seen": 38094856, "step": 66040 }, { "epoch": 9.836907953529938, "grad_norm": 0.28466981649398804, "learning_rate": 4.057566632341414e-08, "loss": 0.8012, "num_input_tokens_seen": 38097800, "step": 66045 }, { "epoch": 9.837652666070897, "grad_norm": 0.32767918705940247, "learning_rate": 4.0206393899761e-08, "loss": 0.7801, "num_input_tokens_seen": 38100808, "step": 66050 }, { "epoch": 9.838397378611855, "grad_norm": 0.35380205512046814, "learning_rate": 3.98388081574802e-08, "loss": 0.7995, "num_input_tokens_seen": 38103784, "step": 66055 }, { "epoch": 9.839142091152816, "grad_norm": 0.21623772382736206, "learning_rate": 3.9472909121412994e-08, "loss": 0.8097, "num_input_tokens_seen": 38106664, "step": 66060 }, { "epoch": 9.839886803693775, "grad_norm": 0.16335487365722656, "learning_rate": 3.910869681628404e-08, "loss": 0.7927, "num_input_tokens_seen": 38109672, "step": 66065 }, { "epoch": 9.840631516234733, "grad_norm": 0.20119403302669525, "learning_rate": 3.8746171266706985e-08, "loss": 0.8182, "num_input_tokens_seen": 38112392, "step": 66070 }, { "epoch": 9.841376228775692, "grad_norm": 0.23192232847213745, "learning_rate": 3.838533249717891e-08, "loss": 0.7941, "num_input_tokens_seen": 38115400, "step": 66075 }, { "epoch": 9.842120941316653, "grad_norm": 0.18880704045295715, "learning_rate": 3.8026180532083066e-08, "loss": 0.8056, "num_input_tokens_seen": 38118504, "step": 66080 }, { "epoch": 9.842865653857611, "grad_norm": 0.2424468994140625, "learning_rate": 3.766871539568895e-08, "loss": 0.8004, "num_input_tokens_seen": 38121352, "step": 66085 }, { "epoch": 9.84361036639857, "grad_norm": 0.21910083293914795, "learning_rate": 3.7312937112152226e-08, "loss": 0.8284, "num_input_tokens_seen": 38124200, "step": 66090 }, { "epoch": 9.844355078939529, "grad_norm": 0.3508121967315674, "learning_rate": 3.695884570552033e-08, "loss": 0.7842, "num_input_tokens_seen": 38127112, "step": 66095 }, { "epoch": 9.84509979148049, "grad_norm": 0.18743082880973816, "learning_rate": 3.660644119971579e-08, "loss": 0.8061, "num_input_tokens_seen": 38130088, "step": 66100 }, { "epoch": 9.845844504021448, "grad_norm": 0.23024387657642365, "learning_rate": 3.6255723618552895e-08, "loss": 0.7944, "num_input_tokens_seen": 38132872, "step": 66105 }, { "epoch": 9.846589216562407, "grad_norm": 0.19910350441932678, "learning_rate": 3.5906692985732124e-08, "loss": 0.8204, "num_input_tokens_seen": 38135752, "step": 66110 }, { "epoch": 9.847333929103366, "grad_norm": 0.21627220511436462, "learning_rate": 3.555934932484295e-08, "loss": 0.8061, "num_input_tokens_seen": 38138728, "step": 66115 }, { "epoch": 9.848078641644324, "grad_norm": 0.2130144089460373, "learning_rate": 3.521369265935548e-08, "loss": 0.7981, "num_input_tokens_seen": 38141672, "step": 66120 }, { "epoch": 9.848823354185285, "grad_norm": 0.22098982334136963, "learning_rate": 3.4869723012623254e-08, "loss": 0.8196, "num_input_tokens_seen": 38144712, "step": 66125 }, { "epoch": 9.849568066726244, "grad_norm": 0.26350510120391846, "learning_rate": 3.452744040789713e-08, "loss": 0.7939, "num_input_tokens_seen": 38147816, "step": 66130 }, { "epoch": 9.850312779267203, "grad_norm": 0.1999792456626892, "learning_rate": 3.418684486830581e-08, "loss": 0.7724, "num_input_tokens_seen": 38150504, "step": 66135 }, { "epoch": 9.851057491808161, "grad_norm": 0.26501837372779846, "learning_rate": 3.384793641686146e-08, "loss": 0.8035, "num_input_tokens_seen": 38153288, "step": 66140 }, { "epoch": 9.851802204349122, "grad_norm": 0.24420976638793945, "learning_rate": 3.351071507646797e-08, "loss": 0.7744, "num_input_tokens_seen": 38156296, "step": 66145 }, { "epoch": 9.85254691689008, "grad_norm": 0.2005496323108673, "learning_rate": 3.3175180869915445e-08, "loss": 0.7875, "num_input_tokens_seen": 38159240, "step": 66150 }, { "epoch": 9.85329162943104, "grad_norm": 0.16047702729701996, "learning_rate": 3.2841333819877415e-08, "loss": 0.7998, "num_input_tokens_seen": 38162440, "step": 66155 }, { "epoch": 9.854036341971998, "grad_norm": 0.16835297644138336, "learning_rate": 3.250917394891084e-08, "loss": 0.824, "num_input_tokens_seen": 38165448, "step": 66160 }, { "epoch": 9.854781054512959, "grad_norm": 0.19236692786216736, "learning_rate": 3.2178701279464426e-08, "loss": 0.7614, "num_input_tokens_seen": 38168552, "step": 66165 }, { "epoch": 9.855525767053917, "grad_norm": 0.19654838740825653, "learning_rate": 3.1849915833870313e-08, "loss": 0.7978, "num_input_tokens_seen": 38171496, "step": 66170 }, { "epoch": 9.856270479594876, "grad_norm": 0.24145066738128662, "learning_rate": 3.1522817634346834e-08, "loss": 0.8082, "num_input_tokens_seen": 38174248, "step": 66175 }, { "epoch": 9.857015192135835, "grad_norm": 0.23723503947257996, "learning_rate": 3.119740670299576e-08, "loss": 0.8033, "num_input_tokens_seen": 38177000, "step": 66180 }, { "epoch": 9.857759904676795, "grad_norm": 0.18541553616523743, "learning_rate": 3.0873683061807826e-08, "loss": 0.8121, "num_input_tokens_seen": 38180104, "step": 66185 }, { "epoch": 9.858504617217754, "grad_norm": 0.20059345662593842, "learning_rate": 3.0551646732659975e-08, "loss": 0.775, "num_input_tokens_seen": 38183144, "step": 66190 }, { "epoch": 9.859249329758713, "grad_norm": 0.2215508371591568, "learning_rate": 3.0231297737312594e-08, "loss": 0.7869, "num_input_tokens_seen": 38185864, "step": 66195 }, { "epoch": 9.859994042299672, "grad_norm": 0.36794906854629517, "learning_rate": 2.991263609741502e-08, "loss": 0.8113, "num_input_tokens_seen": 38189032, "step": 66200 }, { "epoch": 9.860738754840632, "grad_norm": 0.2763875126838684, "learning_rate": 2.9595661834500023e-08, "loss": 0.7931, "num_input_tokens_seen": 38191944, "step": 66205 }, { "epoch": 9.861483467381591, "grad_norm": 0.21386078000068665, "learning_rate": 2.9280374969989366e-08, "loss": 0.764, "num_input_tokens_seen": 38194632, "step": 66210 }, { "epoch": 9.86222817992255, "grad_norm": 0.22564814984798431, "learning_rate": 2.8966775525185453e-08, "loss": 0.8093, "num_input_tokens_seen": 38197576, "step": 66215 }, { "epoch": 9.862972892463509, "grad_norm": 0.22294533252716064, "learning_rate": 2.865486352128244e-08, "loss": 0.7931, "num_input_tokens_seen": 38200520, "step": 66220 }, { "epoch": 9.863717605004469, "grad_norm": 0.25003641843795776, "learning_rate": 2.8344638979357907e-08, "loss": 0.7888, "num_input_tokens_seen": 38203496, "step": 66225 }, { "epoch": 9.864462317545428, "grad_norm": 0.22588691115379333, "learning_rate": 2.8036101920375647e-08, "loss": 0.7941, "num_input_tokens_seen": 38206312, "step": 66230 }, { "epoch": 9.865207030086387, "grad_norm": 0.30206185579299927, "learning_rate": 2.7729252365185643e-08, "loss": 0.7886, "num_input_tokens_seen": 38209064, "step": 66235 }, { "epoch": 9.865951742627345, "grad_norm": 0.17527808248996735, "learning_rate": 2.7424090334521313e-08, "loss": 0.8249, "num_input_tokens_seen": 38211848, "step": 66240 }, { "epoch": 9.866696455168306, "grad_norm": 0.27646762132644653, "learning_rate": 2.7120615849007826e-08, "loss": 0.8269, "num_input_tokens_seen": 38214536, "step": 66245 }, { "epoch": 9.867441167709265, "grad_norm": 0.1803463250398636, "learning_rate": 2.681882892914822e-08, "loss": 0.7689, "num_input_tokens_seen": 38217608, "step": 66250 }, { "epoch": 9.868185880250223, "grad_norm": 0.2939593493938446, "learning_rate": 2.6518729595340075e-08, "loss": 0.8081, "num_input_tokens_seen": 38220520, "step": 66255 }, { "epoch": 9.868930592791182, "grad_norm": 0.23560866713523865, "learning_rate": 2.622031786786161e-08, "loss": 0.8138, "num_input_tokens_seen": 38223368, "step": 66260 }, { "epoch": 9.86967530533214, "grad_norm": 0.22415319085121155, "learning_rate": 2.5923593766880026e-08, "loss": 0.8228, "num_input_tokens_seen": 38226024, "step": 66265 }, { "epoch": 9.870420017873101, "grad_norm": 0.3016575276851654, "learning_rate": 2.5628557312440403e-08, "loss": 0.8204, "num_input_tokens_seen": 38229032, "step": 66270 }, { "epoch": 9.87116473041406, "grad_norm": 0.249882772564888, "learning_rate": 2.5335208524487898e-08, "loss": 0.7855, "num_input_tokens_seen": 38231880, "step": 66275 }, { "epoch": 9.871909442955019, "grad_norm": 0.18612787127494812, "learning_rate": 2.5043547422839986e-08, "loss": 0.7852, "num_input_tokens_seen": 38234760, "step": 66280 }, { "epoch": 9.87265415549598, "grad_norm": 0.254217267036438, "learning_rate": 2.4753574027211457e-08, "loss": 0.8158, "num_input_tokens_seen": 38237448, "step": 66285 }, { "epoch": 9.873398868036938, "grad_norm": 0.18226619064807892, "learning_rate": 2.4465288357192196e-08, "loss": 0.8147, "num_input_tokens_seen": 38240104, "step": 66290 }, { "epoch": 9.874143580577897, "grad_norm": 0.17085877060890198, "learning_rate": 2.4178690432266617e-08, "loss": 0.7572, "num_input_tokens_seen": 38243048, "step": 66295 }, { "epoch": 9.874888293118856, "grad_norm": 0.2021293193101883, "learning_rate": 2.389378027179978e-08, "loss": 0.8048, "num_input_tokens_seen": 38245960, "step": 66300 }, { "epoch": 9.875633005659815, "grad_norm": 0.29072341322898865, "learning_rate": 2.3610557895045736e-08, "loss": 0.8068, "num_input_tokens_seen": 38248776, "step": 66305 }, { "epoch": 9.876377718200775, "grad_norm": 0.21282990276813507, "learning_rate": 2.3329023321144727e-08, "loss": 0.8107, "num_input_tokens_seen": 38251880, "step": 66310 }, { "epoch": 9.877122430741734, "grad_norm": 0.22281411290168762, "learning_rate": 2.304917656912042e-08, "loss": 0.799, "num_input_tokens_seen": 38254696, "step": 66315 }, { "epoch": 9.877867143282693, "grad_norm": 0.1938464343547821, "learning_rate": 2.27710176578827e-08, "loss": 0.7827, "num_input_tokens_seen": 38257640, "step": 66320 }, { "epoch": 9.878611855823651, "grad_norm": 0.1804664582014084, "learning_rate": 2.2494546606230405e-08, "loss": 0.7831, "num_input_tokens_seen": 38260680, "step": 66325 }, { "epoch": 9.879356568364612, "grad_norm": 0.2787172198295593, "learning_rate": 2.221976343284582e-08, "loss": 0.8074, "num_input_tokens_seen": 38263464, "step": 66330 }, { "epoch": 9.88010128090557, "grad_norm": 0.21160630881786346, "learning_rate": 2.1946668156297422e-08, "loss": 0.7921, "num_input_tokens_seen": 38266408, "step": 66335 }, { "epoch": 9.88084599344653, "grad_norm": 0.21495181322097778, "learning_rate": 2.1675260795037122e-08, "loss": 0.7727, "num_input_tokens_seen": 38269480, "step": 66340 }, { "epoch": 9.881590705987488, "grad_norm": 0.2639269232749939, "learning_rate": 2.1405541367411353e-08, "loss": 0.792, "num_input_tokens_seen": 38272168, "step": 66345 }, { "epoch": 9.882335418528449, "grad_norm": 0.15327970683574677, "learning_rate": 2.113750989164165e-08, "loss": 0.8278, "num_input_tokens_seen": 38275304, "step": 66350 }, { "epoch": 9.883080131069407, "grad_norm": 0.21602550148963928, "learning_rate": 2.0871166385844077e-08, "loss": 0.8201, "num_input_tokens_seen": 38278344, "step": 66355 }, { "epoch": 9.883824843610366, "grad_norm": 0.21395540237426758, "learning_rate": 2.060651086801535e-08, "loss": 0.7855, "num_input_tokens_seen": 38281160, "step": 66360 }, { "epoch": 9.884569556151325, "grad_norm": 0.18879760801792145, "learning_rate": 2.0343543356038385e-08, "loss": 0.7701, "num_input_tokens_seen": 38284264, "step": 66365 }, { "epoch": 9.885314268692285, "grad_norm": 0.32680410146713257, "learning_rate": 2.008226386768508e-08, "loss": 0.814, "num_input_tokens_seen": 38287272, "step": 66370 }, { "epoch": 9.886058981233244, "grad_norm": 0.2949073612689972, "learning_rate": 1.982267242061353e-08, "loss": 0.7581, "num_input_tokens_seen": 38290248, "step": 66375 }, { "epoch": 9.886803693774203, "grad_norm": 0.24278651177883148, "learning_rate": 1.9564769032362485e-08, "loss": 0.7722, "num_input_tokens_seen": 38292936, "step": 66380 }, { "epoch": 9.887548406315162, "grad_norm": 0.2019096165895462, "learning_rate": 1.9308553720359667e-08, "loss": 0.8009, "num_input_tokens_seen": 38296008, "step": 66385 }, { "epoch": 9.888293118856122, "grad_norm": 0.2615456283092499, "learning_rate": 1.9054026501921785e-08, "loss": 0.8427, "num_input_tokens_seen": 38298888, "step": 66390 }, { "epoch": 9.889037831397081, "grad_norm": 0.21718168258666992, "learning_rate": 1.8801187394248965e-08, "loss": 0.7682, "num_input_tokens_seen": 38301640, "step": 66395 }, { "epoch": 9.88978254393804, "grad_norm": 0.2148987501859665, "learning_rate": 1.8550036414424765e-08, "loss": 0.8143, "num_input_tokens_seen": 38304488, "step": 66400 }, { "epoch": 9.890527256478999, "grad_norm": 0.2275494635105133, "learning_rate": 1.830057357942172e-08, "loss": 0.8264, "num_input_tokens_seen": 38307272, "step": 66405 }, { "epoch": 9.891271969019959, "grad_norm": 0.24601367115974426, "learning_rate": 1.8052798906098568e-08, "loss": 0.8089, "num_input_tokens_seen": 38309896, "step": 66410 }, { "epoch": 9.892016681560918, "grad_norm": 0.21828094124794006, "learning_rate": 1.780671241119469e-08, "loss": 0.7742, "num_input_tokens_seen": 38312584, "step": 66415 }, { "epoch": 9.892761394101877, "grad_norm": 0.2683950662612915, "learning_rate": 1.756231411134679e-08, "loss": 0.7722, "num_input_tokens_seen": 38315656, "step": 66420 }, { "epoch": 9.893506106642835, "grad_norm": 0.1795016974210739, "learning_rate": 1.7319604023066648e-08, "loss": 0.7976, "num_input_tokens_seen": 38318664, "step": 66425 }, { "epoch": 9.894250819183796, "grad_norm": 0.28307291865348816, "learning_rate": 1.7078582162752265e-08, "loss": 0.7899, "num_input_tokens_seen": 38321544, "step": 66430 }, { "epoch": 9.894995531724755, "grad_norm": 0.24975177645683289, "learning_rate": 1.6839248546696163e-08, "loss": 0.7714, "num_input_tokens_seen": 38324456, "step": 66435 }, { "epoch": 9.895740244265713, "grad_norm": 0.21237905323505402, "learning_rate": 1.6601603191071513e-08, "loss": 0.7997, "num_input_tokens_seen": 38327464, "step": 66440 }, { "epoch": 9.896484956806672, "grad_norm": 0.21782754361629486, "learning_rate": 1.6365646111932144e-08, "loss": 0.7899, "num_input_tokens_seen": 38330312, "step": 66445 }, { "epoch": 9.897229669347631, "grad_norm": 0.29611915349960327, "learning_rate": 1.6131377325226405e-08, "loss": 0.7806, "num_input_tokens_seen": 38333160, "step": 66450 }, { "epoch": 9.897974381888591, "grad_norm": 0.1665499359369278, "learning_rate": 1.589879684678608e-08, "loss": 0.7927, "num_input_tokens_seen": 38335944, "step": 66455 }, { "epoch": 9.89871909442955, "grad_norm": 0.2145588994026184, "learning_rate": 1.566790469232915e-08, "loss": 0.8341, "num_input_tokens_seen": 38339016, "step": 66460 }, { "epoch": 9.899463806970509, "grad_norm": 0.25137054920196533, "learning_rate": 1.543870087745425e-08, "loss": 0.8041, "num_input_tokens_seen": 38341992, "step": 66465 }, { "epoch": 9.900208519511468, "grad_norm": 0.17690271139144897, "learning_rate": 1.5211185417651762e-08, "loss": 0.7933, "num_input_tokens_seen": 38344712, "step": 66470 }, { "epoch": 9.900953232052428, "grad_norm": 0.2834147810935974, "learning_rate": 1.4985358328298284e-08, "loss": 0.7963, "num_input_tokens_seen": 38347464, "step": 66475 }, { "epoch": 9.901697944593387, "grad_norm": 0.21619556844234467, "learning_rate": 1.4761219624651045e-08, "loss": 0.8196, "num_input_tokens_seen": 38350632, "step": 66480 }, { "epoch": 9.902442657134346, "grad_norm": 0.34353646636009216, "learning_rate": 1.4538769321859046e-08, "loss": 0.7946, "num_input_tokens_seen": 38353448, "step": 66485 }, { "epoch": 9.903187369675305, "grad_norm": 0.20591993629932404, "learning_rate": 1.431800743495193e-08, "loss": 0.8045, "num_input_tokens_seen": 38356072, "step": 66490 }, { "epoch": 9.903932082216265, "grad_norm": 0.19420157372951508, "learning_rate": 1.4098933978851093e-08, "loss": 0.8057, "num_input_tokens_seen": 38358984, "step": 66495 }, { "epoch": 9.904676794757224, "grad_norm": 0.1788441389799118, "learning_rate": 1.3881548968358581e-08, "loss": 0.7912, "num_input_tokens_seen": 38361736, "step": 66500 }, { "epoch": 9.905421507298183, "grad_norm": 0.2953476905822754, "learning_rate": 1.3665852418165426e-08, "loss": 0.7803, "num_input_tokens_seen": 38364584, "step": 66505 }, { "epoch": 9.906166219839141, "grad_norm": 0.1434987187385559, "learning_rate": 1.3451844342846077e-08, "loss": 0.8107, "num_input_tokens_seen": 38367496, "step": 66510 }, { "epoch": 9.906910932380102, "grad_norm": 0.2916922867298126, "learning_rate": 1.3239524756863964e-08, "loss": 0.7843, "num_input_tokens_seen": 38370536, "step": 66515 }, { "epoch": 9.90765564492106, "grad_norm": 0.259696900844574, "learning_rate": 1.302889367456317e-08, "loss": 0.8197, "num_input_tokens_seen": 38373224, "step": 66520 }, { "epoch": 9.90840035746202, "grad_norm": 0.19042232632637024, "learning_rate": 1.2819951110182305e-08, "loss": 0.7821, "num_input_tokens_seen": 38376104, "step": 66525 }, { "epoch": 9.909145070002978, "grad_norm": 0.23977376520633698, "learning_rate": 1.261269707784063e-08, "loss": 0.8062, "num_input_tokens_seen": 38378888, "step": 66530 }, { "epoch": 9.909889782543939, "grad_norm": 0.2870892286300659, "learning_rate": 1.2407131591538057e-08, "loss": 0.7807, "num_input_tokens_seen": 38381832, "step": 66535 }, { "epoch": 9.910634495084897, "grad_norm": 0.19449730217456818, "learning_rate": 1.2203254665171804e-08, "loss": 0.8003, "num_input_tokens_seen": 38384744, "step": 66540 }, { "epoch": 9.911379207625856, "grad_norm": 0.24197839200496674, "learning_rate": 1.2001066312516962e-08, "loss": 0.7923, "num_input_tokens_seen": 38387816, "step": 66545 }, { "epoch": 9.912123920166815, "grad_norm": 0.20159830152988434, "learning_rate": 1.1800566547234825e-08, "loss": 0.8002, "num_input_tokens_seen": 38390696, "step": 66550 }, { "epoch": 9.912868632707776, "grad_norm": 0.20604372024536133, "learning_rate": 1.1601755382875667e-08, "loss": 0.7753, "num_input_tokens_seen": 38393224, "step": 66555 }, { "epoch": 9.913613345248734, "grad_norm": 0.2601601779460907, "learning_rate": 1.1404632832873186e-08, "loss": 0.791, "num_input_tokens_seen": 38395944, "step": 66560 }, { "epoch": 9.914358057789693, "grad_norm": 0.20297881960868835, "learning_rate": 1.1209198910552831e-08, "loss": 0.8416, "num_input_tokens_seen": 38398792, "step": 66565 }, { "epoch": 9.915102770330652, "grad_norm": 0.21171173453330994, "learning_rate": 1.1015453629115158e-08, "loss": 0.785, "num_input_tokens_seen": 38401736, "step": 66570 }, { "epoch": 9.915847482871612, "grad_norm": 0.22979623079299927, "learning_rate": 1.0823397001655244e-08, "loss": 0.8066, "num_input_tokens_seen": 38404584, "step": 66575 }, { "epoch": 9.916592195412571, "grad_norm": 0.4066873788833618, "learning_rate": 1.0633029041154374e-08, "loss": 0.7965, "num_input_tokens_seen": 38407496, "step": 66580 }, { "epoch": 9.91733690795353, "grad_norm": 0.21415086090564728, "learning_rate": 1.0444349760471705e-08, "loss": 0.7801, "num_input_tokens_seen": 38410152, "step": 66585 }, { "epoch": 9.918081620494489, "grad_norm": 0.20327448844909668, "learning_rate": 1.0257359172360925e-08, "loss": 0.8164, "num_input_tokens_seen": 38412904, "step": 66590 }, { "epoch": 9.91882633303545, "grad_norm": 0.24525104463100433, "learning_rate": 1.0072057289456371e-08, "loss": 0.7758, "num_input_tokens_seen": 38416008, "step": 66595 }, { "epoch": 9.919571045576408, "grad_norm": 0.14702096581459045, "learning_rate": 9.888444124278585e-09, "loss": 0.7693, "num_input_tokens_seen": 38418952, "step": 66600 }, { "epoch": 9.920315758117367, "grad_norm": 0.13644681870937347, "learning_rate": 9.706519689239857e-09, "loss": 0.8058, "num_input_tokens_seen": 38421672, "step": 66605 }, { "epoch": 9.921060470658325, "grad_norm": 0.2194325476884842, "learning_rate": 9.526283996630359e-09, "loss": 0.7967, "num_input_tokens_seen": 38425000, "step": 66610 }, { "epoch": 9.921805183199286, "grad_norm": 0.162185937166214, "learning_rate": 9.347737058632012e-09, "loss": 0.7877, "num_input_tokens_seen": 38427688, "step": 66615 }, { "epoch": 9.922549895740245, "grad_norm": 0.32983556389808655, "learning_rate": 9.170878887307388e-09, "loss": 0.7928, "num_input_tokens_seen": 38430600, "step": 66620 }, { "epoch": 9.923294608281203, "grad_norm": 0.21309323608875275, "learning_rate": 8.995709494610816e-09, "loss": 0.8048, "num_input_tokens_seen": 38433448, "step": 66625 }, { "epoch": 9.924039320822162, "grad_norm": 0.27649247646331787, "learning_rate": 8.82222889237727e-09, "loss": 0.8148, "num_input_tokens_seen": 38436264, "step": 66630 }, { "epoch": 9.924784033363121, "grad_norm": 0.21339698135852814, "learning_rate": 8.65043709233071e-09, "loss": 0.8152, "num_input_tokens_seen": 38438984, "step": 66635 }, { "epoch": 9.925528745904082, "grad_norm": 0.18287383019924164, "learning_rate": 8.480334106081289e-09, "loss": 0.7889, "num_input_tokens_seen": 38441544, "step": 66640 }, { "epoch": 9.92627345844504, "grad_norm": 0.22673314809799194, "learning_rate": 8.311919945119817e-09, "loss": 0.805, "num_input_tokens_seen": 38444456, "step": 66645 }, { "epoch": 9.927018170985999, "grad_norm": 0.19330409169197083, "learning_rate": 8.145194620834407e-09, "loss": 0.7856, "num_input_tokens_seen": 38447176, "step": 66650 }, { "epoch": 9.927762883526958, "grad_norm": 0.2118837684392929, "learning_rate": 7.98015814448272e-09, "loss": 0.7757, "num_input_tokens_seen": 38450408, "step": 66655 }, { "epoch": 9.928507596067918, "grad_norm": 0.20132380723953247, "learning_rate": 7.81681052722527e-09, "loss": 0.791, "num_input_tokens_seen": 38453480, "step": 66660 }, { "epoch": 9.929252308608877, "grad_norm": 0.19410252571105957, "learning_rate": 7.655151780094905e-09, "loss": 0.8028, "num_input_tokens_seen": 38456456, "step": 66665 }, { "epoch": 9.929997021149836, "grad_norm": 0.28690433502197266, "learning_rate": 7.49518191401899e-09, "loss": 0.802, "num_input_tokens_seen": 38459208, "step": 66670 }, { "epoch": 9.930741733690795, "grad_norm": 0.21321386098861694, "learning_rate": 7.336900939805546e-09, "loss": 0.779, "num_input_tokens_seen": 38461960, "step": 66675 }, { "epoch": 9.931486446231755, "grad_norm": 0.21648430824279785, "learning_rate": 7.180308868154351e-09, "loss": 0.8111, "num_input_tokens_seen": 38464904, "step": 66680 }, { "epoch": 9.932231158772714, "grad_norm": 0.23629410564899445, "learning_rate": 7.025405709640276e-09, "loss": 0.7864, "num_input_tokens_seen": 38467688, "step": 66685 }, { "epoch": 9.932975871313673, "grad_norm": 0.2147013247013092, "learning_rate": 6.8721914747382764e-09, "loss": 0.7847, "num_input_tokens_seen": 38470504, "step": 66690 }, { "epoch": 9.933720583854631, "grad_norm": 0.22344526648521423, "learning_rate": 6.720666173798407e-09, "loss": 0.7649, "num_input_tokens_seen": 38473736, "step": 66695 }, { "epoch": 9.934465296395592, "grad_norm": 0.195779949426651, "learning_rate": 6.570829817059699e-09, "loss": 0.8098, "num_input_tokens_seen": 38476776, "step": 66700 }, { "epoch": 9.93521000893655, "grad_norm": 0.2802709937095642, "learning_rate": 6.422682414650161e-09, "loss": 0.8045, "num_input_tokens_seen": 38479720, "step": 66705 }, { "epoch": 9.93595472147751, "grad_norm": 0.2557414472103119, "learning_rate": 6.276223976578455e-09, "loss": 0.793, "num_input_tokens_seen": 38482568, "step": 66710 }, { "epoch": 9.936699434018468, "grad_norm": 0.3031395673751831, "learning_rate": 6.131454512742218e-09, "loss": 0.79, "num_input_tokens_seen": 38485448, "step": 66715 }, { "epoch": 9.937444146559429, "grad_norm": 0.2197691798210144, "learning_rate": 5.988374032922517e-09, "loss": 0.7693, "num_input_tokens_seen": 38488200, "step": 66720 }, { "epoch": 9.938188859100388, "grad_norm": 0.25775253772735596, "learning_rate": 5.846982546792168e-09, "loss": 0.8102, "num_input_tokens_seen": 38490952, "step": 66725 }, { "epoch": 9.938933571641346, "grad_norm": 0.2424839437007904, "learning_rate": 5.707280063904641e-09, "loss": 0.8023, "num_input_tokens_seen": 38493736, "step": 66730 }, { "epoch": 9.939678284182305, "grad_norm": 0.33292001485824585, "learning_rate": 5.569266593699607e-09, "loss": 0.8025, "num_input_tokens_seen": 38497064, "step": 66735 }, { "epoch": 9.940422996723266, "grad_norm": 0.17346373200416565, "learning_rate": 5.4329421455029395e-09, "loss": 0.7812, "num_input_tokens_seen": 38499976, "step": 66740 }, { "epoch": 9.941167709264224, "grad_norm": 0.23523658514022827, "learning_rate": 5.298306728526714e-09, "loss": 0.806, "num_input_tokens_seen": 38502472, "step": 66745 }, { "epoch": 9.941912421805183, "grad_norm": 0.19867976009845734, "learning_rate": 5.165360351871984e-09, "loss": 0.8061, "num_input_tokens_seen": 38505096, "step": 66750 }, { "epoch": 9.942657134346142, "grad_norm": 0.24963133037090302, "learning_rate": 5.0341030245204536e-09, "loss": 0.7964, "num_input_tokens_seen": 38507944, "step": 66755 }, { "epoch": 9.943401846887102, "grad_norm": 0.208560049533844, "learning_rate": 4.904534755340029e-09, "loss": 0.8128, "num_input_tokens_seen": 38510856, "step": 66760 }, { "epoch": 9.944146559428061, "grad_norm": 0.33537372946739197, "learning_rate": 4.77665555309037e-09, "loss": 0.802, "num_input_tokens_seen": 38513864, "step": 66765 }, { "epoch": 9.94489127196902, "grad_norm": 0.2197631597518921, "learning_rate": 4.650465426409012e-09, "loss": 0.8011, "num_input_tokens_seen": 38516744, "step": 66770 }, { "epoch": 9.945635984509979, "grad_norm": 0.22784288227558136, "learning_rate": 4.525964383828018e-09, "loss": 0.7757, "num_input_tokens_seen": 38520808, "step": 66775 }, { "epoch": 9.946380697050937, "grad_norm": 0.2532939314842224, "learning_rate": 4.40315243375733e-09, "loss": 0.8004, "num_input_tokens_seen": 38523432, "step": 66780 }, { "epoch": 9.947125409591898, "grad_norm": 0.2194100320339203, "learning_rate": 4.282029584495861e-09, "loss": 0.7721, "num_input_tokens_seen": 38526536, "step": 66785 }, { "epoch": 9.947870122132857, "grad_norm": 0.17810285091400146, "learning_rate": 4.1625958442315096e-09, "loss": 0.8191, "num_input_tokens_seen": 38529640, "step": 66790 }, { "epoch": 9.948614834673815, "grad_norm": 0.22205428779125214, "learning_rate": 4.0448512210300436e-09, "loss": 0.7953, "num_input_tokens_seen": 38532488, "step": 66795 }, { "epoch": 9.949359547214776, "grad_norm": 0.1922808289527893, "learning_rate": 3.928795722854539e-09, "loss": 0.8058, "num_input_tokens_seen": 38535464, "step": 66800 }, { "epoch": 9.950104259755735, "grad_norm": 0.21072429418563843, "learning_rate": 3.81442935754317e-09, "loss": 0.8058, "num_input_tokens_seen": 38538216, "step": 66805 }, { "epoch": 9.950848972296694, "grad_norm": 0.305376797914505, "learning_rate": 3.701752132825864e-09, "loss": 0.8158, "num_input_tokens_seen": 38540872, "step": 66810 }, { "epoch": 9.951593684837652, "grad_norm": 0.21116212010383606, "learning_rate": 3.590764056315976e-09, "loss": 0.7742, "num_input_tokens_seen": 38543976, "step": 66815 }, { "epoch": 9.952338397378611, "grad_norm": 0.21047748625278473, "learning_rate": 3.481465135515838e-09, "loss": 0.8104, "num_input_tokens_seen": 38546792, "step": 66820 }, { "epoch": 9.953083109919572, "grad_norm": 0.2840014398097992, "learning_rate": 3.3738553778084324e-09, "loss": 0.7739, "num_input_tokens_seen": 38550120, "step": 66825 }, { "epoch": 9.95382782246053, "grad_norm": 0.18351809680461884, "learning_rate": 3.267934790465721e-09, "loss": 0.7779, "num_input_tokens_seen": 38553000, "step": 66830 }, { "epoch": 9.954572535001489, "grad_norm": 0.19713065028190613, "learning_rate": 3.1637033806486414e-09, "loss": 0.8012, "num_input_tokens_seen": 38556040, "step": 66835 }, { "epoch": 9.955317247542448, "grad_norm": 0.23152557015419006, "learning_rate": 3.061161155398784e-09, "loss": 0.7978, "num_input_tokens_seen": 38558792, "step": 66840 }, { "epoch": 9.956061960083408, "grad_norm": 0.2334810495376587, "learning_rate": 2.9603081216467156e-09, "loss": 0.828, "num_input_tokens_seen": 38561512, "step": 66845 }, { "epoch": 9.956806672624367, "grad_norm": 0.19862253963947296, "learning_rate": 2.8611442862036544e-09, "loss": 0.7879, "num_input_tokens_seen": 38564424, "step": 66850 }, { "epoch": 9.957551385165326, "grad_norm": 0.27894970774650574, "learning_rate": 2.763669655775347e-09, "loss": 0.7988, "num_input_tokens_seen": 38567016, "step": 66855 }, { "epoch": 9.958296097706285, "grad_norm": 0.22808687388896942, "learning_rate": 2.667884236945417e-09, "loss": 0.8135, "num_input_tokens_seen": 38569832, "step": 66860 }, { "epoch": 9.959040810247245, "grad_norm": 0.23274391889572144, "learning_rate": 2.5737880361892397e-09, "loss": 0.7869, "num_input_tokens_seen": 38572776, "step": 66865 }, { "epoch": 9.959785522788204, "grad_norm": 0.19590818881988525, "learning_rate": 2.4813810598628416e-09, "loss": 0.7837, "num_input_tokens_seen": 38575944, "step": 66870 }, { "epoch": 9.960530235329163, "grad_norm": 0.2476150393486023, "learning_rate": 2.3906633142140035e-09, "loss": 0.7942, "num_input_tokens_seen": 38578600, "step": 66875 }, { "epoch": 9.961274947870121, "grad_norm": 0.3223850727081299, "learning_rate": 2.3016348053711558e-09, "loss": 0.7932, "num_input_tokens_seen": 38581704, "step": 66880 }, { "epoch": 9.962019660411082, "grad_norm": 0.21576634049415588, "learning_rate": 2.2142955393489316e-09, "loss": 0.8251, "num_input_tokens_seen": 38584328, "step": 66885 }, { "epoch": 9.96276437295204, "grad_norm": 0.3366986811161041, "learning_rate": 2.1286455220537182e-09, "loss": 0.793, "num_input_tokens_seen": 38587240, "step": 66890 }, { "epoch": 9.963509085493, "grad_norm": 0.2247239351272583, "learning_rate": 2.044684759269777e-09, "loss": 0.7793, "num_input_tokens_seen": 38590312, "step": 66895 }, { "epoch": 9.964253798033958, "grad_norm": 0.25096452236175537, "learning_rate": 1.962413256670348e-09, "loss": 0.8033, "num_input_tokens_seen": 38593096, "step": 66900 }, { "epoch": 9.964998510574919, "grad_norm": 0.2382669746875763, "learning_rate": 1.881831019817648e-09, "loss": 0.8171, "num_input_tokens_seen": 38595816, "step": 66905 }, { "epoch": 9.965743223115878, "grad_norm": 0.24558357894420624, "learning_rate": 1.802938054154546e-09, "loss": 0.7873, "num_input_tokens_seen": 38598568, "step": 66910 }, { "epoch": 9.966487935656836, "grad_norm": 0.22090736031532288, "learning_rate": 1.7257343650156632e-09, "loss": 0.8125, "num_input_tokens_seen": 38601384, "step": 66915 }, { "epoch": 9.967232648197795, "grad_norm": 0.23176616430282593, "learning_rate": 1.6502199576134968e-09, "loss": 0.7974, "num_input_tokens_seen": 38604232, "step": 66920 }, { "epoch": 9.967977360738756, "grad_norm": 0.18571987748146057, "learning_rate": 1.576394837055073e-09, "loss": 0.8376, "num_input_tokens_seen": 38607240, "step": 66925 }, { "epoch": 9.968722073279714, "grad_norm": 0.20589886605739594, "learning_rate": 1.5042590083280683e-09, "loss": 0.7925, "num_input_tokens_seen": 38610600, "step": 66930 }, { "epoch": 9.969466785820673, "grad_norm": 0.15855994820594788, "learning_rate": 1.433812476306362e-09, "loss": 0.8017, "num_input_tokens_seen": 38613512, "step": 66935 }, { "epoch": 9.970211498361632, "grad_norm": 0.6698435544967651, "learning_rate": 1.3650552457500353e-09, "loss": 0.8159, "num_input_tokens_seen": 38616616, "step": 66940 }, { "epoch": 9.970956210902592, "grad_norm": 0.17930512130260468, "learning_rate": 1.2979873213081473e-09, "loss": 0.8042, "num_input_tokens_seen": 38619496, "step": 66945 }, { "epoch": 9.971700923443551, "grad_norm": 0.23697197437286377, "learning_rate": 1.2326087075104076e-09, "loss": 0.789, "num_input_tokens_seen": 38622440, "step": 66950 }, { "epoch": 9.97244563598451, "grad_norm": 0.28445908427238464, "learning_rate": 1.1689194087727285e-09, "loss": 0.7961, "num_input_tokens_seen": 38625448, "step": 66955 }, { "epoch": 9.973190348525469, "grad_norm": 0.30089500546455383, "learning_rate": 1.1069194294055507e-09, "loss": 0.8158, "num_input_tokens_seen": 38628360, "step": 66960 }, { "epoch": 9.973935061066427, "grad_norm": 0.23442751169204712, "learning_rate": 1.0466087735916396e-09, "loss": 0.8021, "num_input_tokens_seen": 38630824, "step": 66965 }, { "epoch": 9.974679773607388, "grad_norm": 0.223042294383049, "learning_rate": 9.879874454110648e-10, "loss": 0.805, "num_input_tokens_seen": 38633672, "step": 66970 }, { "epoch": 9.975424486148347, "grad_norm": 0.2620997428894043, "learning_rate": 9.310554488245471e-10, "loss": 0.7903, "num_input_tokens_seen": 38636328, "step": 66975 }, { "epoch": 9.976169198689306, "grad_norm": 0.26411303877830505, "learning_rate": 8.758127876762335e-10, "loss": 0.8198, "num_input_tokens_seen": 38639016, "step": 66980 }, { "epoch": 9.976913911230266, "grad_norm": 0.25348755717277527, "learning_rate": 8.222594657020244e-10, "loss": 0.8052, "num_input_tokens_seen": 38642056, "step": 66985 }, { "epoch": 9.977658623771225, "grad_norm": 0.22953322529792786, "learning_rate": 7.703954865212471e-10, "loss": 0.8069, "num_input_tokens_seen": 38645192, "step": 66990 }, { "epoch": 9.978403336312184, "grad_norm": 0.25463607907295227, "learning_rate": 7.202208536366551e-10, "loss": 0.7753, "num_input_tokens_seen": 38648200, "step": 66995 }, { "epoch": 9.979148048853142, "grad_norm": 0.216519296169281, "learning_rate": 6.717355704427553e-10, "loss": 0.777, "num_input_tokens_seen": 38651112, "step": 67000 }, { "epoch": 9.979892761394101, "grad_norm": 0.21888233721256256, "learning_rate": 6.249396402091545e-10, "loss": 0.8114, "num_input_tokens_seen": 38654216, "step": 67005 }, { "epoch": 9.980637473935062, "grad_norm": 0.2482878416776657, "learning_rate": 5.79833066102764e-10, "loss": 0.8056, "num_input_tokens_seen": 38657448, "step": 67010 }, { "epoch": 9.98138218647602, "grad_norm": 0.43418216705322266, "learning_rate": 5.364158511739215e-10, "loss": 0.8029, "num_input_tokens_seen": 38660392, "step": 67015 }, { "epoch": 9.98212689901698, "grad_norm": 0.19608727097511292, "learning_rate": 4.946879983508401e-10, "loss": 0.7936, "num_input_tokens_seen": 38663176, "step": 67020 }, { "epoch": 9.982871611557938, "grad_norm": 0.15953238308429718, "learning_rate": 4.5464951045626204e-10, "loss": 0.792, "num_input_tokens_seen": 38666088, "step": 67025 }, { "epoch": 9.983616324098898, "grad_norm": 0.21006396412849426, "learning_rate": 4.163003901963558e-10, "loss": 0.817, "num_input_tokens_seen": 38668840, "step": 67030 }, { "epoch": 9.984361036639857, "grad_norm": 0.2052076905965805, "learning_rate": 3.796406401634922e-10, "loss": 0.7985, "num_input_tokens_seen": 38671592, "step": 67035 }, { "epoch": 9.985105749180816, "grad_norm": 0.18208058178424835, "learning_rate": 3.4467026283069305e-10, "loss": 0.8066, "num_input_tokens_seen": 38674472, "step": 67040 }, { "epoch": 9.985850461721775, "grad_norm": 0.5576798915863037, "learning_rate": 3.1138926056550886e-10, "loss": 0.8165, "num_input_tokens_seen": 38677480, "step": 67045 }, { "epoch": 9.986595174262735, "grad_norm": 0.20169679820537567, "learning_rate": 2.7979763561614137e-10, "loss": 0.7921, "num_input_tokens_seen": 38680520, "step": 67050 }, { "epoch": 9.987339886803694, "grad_norm": 0.26243117451667786, "learning_rate": 2.498953901142187e-10, "loss": 0.7933, "num_input_tokens_seen": 38683496, "step": 67055 }, { "epoch": 9.988084599344653, "grad_norm": 0.1405549943447113, "learning_rate": 2.216825260858979e-10, "loss": 0.8107, "num_input_tokens_seen": 38686216, "step": 67060 }, { "epoch": 9.988829311885612, "grad_norm": 0.2493140548467636, "learning_rate": 1.951590454324359e-10, "loss": 0.7906, "num_input_tokens_seen": 38689032, "step": 67065 }, { "epoch": 9.989574024426572, "grad_norm": 0.2302541434764862, "learning_rate": 1.7032494994961846e-10, "loss": 0.8173, "num_input_tokens_seen": 38692040, "step": 67070 }, { "epoch": 9.99031873696753, "grad_norm": 0.3365922272205353, "learning_rate": 1.4718024131110674e-10, "loss": 0.808, "num_input_tokens_seen": 38695016, "step": 67075 }, { "epoch": 9.99106344950849, "grad_norm": 0.2561247646808624, "learning_rate": 1.2572492108786638e-10, "loss": 0.7942, "num_input_tokens_seen": 38697736, "step": 67080 }, { "epoch": 9.991808162049448, "grad_norm": 0.21966920793056488, "learning_rate": 1.0595899072318727e-10, "loss": 0.7874, "num_input_tokens_seen": 38701128, "step": 67085 }, { "epoch": 9.992552874590409, "grad_norm": 0.227018803358078, "learning_rate": 8.788245155766372e-11, "loss": 0.8215, "num_input_tokens_seen": 38703880, "step": 67090 }, { "epoch": 9.993297587131368, "grad_norm": 0.22430330514907837, "learning_rate": 7.149530480976552e-11, "loss": 0.7943, "num_input_tokens_seen": 38707080, "step": 67095 }, { "epoch": 9.994042299672326, "grad_norm": 0.24544598162174225, "learning_rate": 5.679755158971567e-11, "loss": 0.8028, "num_input_tokens_seen": 38709864, "step": 67100 }, { "epoch": 9.994787012213285, "grad_norm": 0.1964349001646042, "learning_rate": 4.3789192888388233e-11, "loss": 0.7848, "num_input_tokens_seen": 38712616, "step": 67105 }, { "epoch": 9.995531724754246, "grad_norm": 0.21350707113742828, "learning_rate": 3.247022958285939e-11, "loss": 0.7963, "num_input_tokens_seen": 38715240, "step": 67110 }, { "epoch": 9.996276437295204, "grad_norm": 0.1501658707857132, "learning_rate": 2.2840662441958594e-11, "loss": 0.8164, "num_input_tokens_seen": 38717896, "step": 67115 }, { "epoch": 9.997021149836163, "grad_norm": 0.23343180119991302, "learning_rate": 1.490049211516631e-11, "loss": 0.7993, "num_input_tokens_seen": 38721096, "step": 67120 }, { "epoch": 9.997765862377122, "grad_norm": 0.20540215075016022, "learning_rate": 8.649719140940703e-12, "loss": 0.8085, "num_input_tokens_seen": 38724328, "step": 67125 }, { "epoch": 9.998510574918082, "grad_norm": 0.22848814725875854, "learning_rate": 4.088343935615413e-12, "loss": 0.7857, "num_input_tokens_seen": 38727112, "step": 67130 }, { "epoch": 9.999255287459041, "grad_norm": 0.19356849789619446, "learning_rate": 1.2163668156039976e-12, "loss": 0.7922, "num_input_tokens_seen": 38729832, "step": 67135 }, { "epoch": 10.0, "grad_norm": 0.459156334400177, "learning_rate": 3.378796686881458e-14, "loss": 0.7855, "num_input_tokens_seen": 38732208, "step": 67140 }, { "epoch": 10.0, "eval_loss": 0.8028939366340637, "eval_runtime": 45.1763, "eval_samples_per_second": 66.052, "eval_steps_per_second": 16.513, "num_input_tokens_seen": 38732208, "step": 67140 }, { "epoch": 10.0, "num_input_tokens_seen": 38732208, "step": 67140, "total_flos": 1.7440938205214147e+18, "train_loss": 1.0262119300989725, "train_runtime": 10175.179, "train_samples_per_second": 26.391, "train_steps_per_second": 6.598 } ], "logging_steps": 5, "max_steps": 67140, "num_input_tokens_seen": 38732208, "num_train_epochs": 10, "save_steps": 3357, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7440938205214147e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }